setup:
installing
here::i_am("notebooks/aale_resubmission_2021.nb.Rmd")
system('sudo apt-get update -y && sudo apt-get install -y libxt-dev')
# https://stackoverflow.com/questions/23642353/error-message-installing-cairo-package-in-r
# Enables `Cairo` installation, ComplexHeatmap dependency
system(paste0('sudo cp ',
here::here('install_bin/bwtool'),
' /usr/local/bin/'))
system('sudo apt install -y mysql-server')
system('sudo apt-get install -y libmagick++-6.q16-dev')
system('sudo apt-get install -y libpoppler-cpp-dev')
BiocManager::install(c("DESeq2",
"bamsignals",
"rGREAT",
"MotifDb",
"ComplexHeatmap"), update = FALSE, ask = FALSE)
remotes::install_github(repo = "poisonalien/trackplot", upgrade = F)
install.packages(c('magick', 'cowplot', 'Hmisc', 'pdftools', 'svglite'))
loading
suppressPackageStartupMessages({
library(tidyverse)
library(here)
library(ggsci)
library(ggpubr)
library(ggsignif)
library(ggthemes)
library(ggrepel)
library(ggforce)
library(ggpmisc)
library(cowplot)
library(patchwork)
library(extrafont)
library(ggrepel)
library(pheatmap)
library(viridis)
library(patchwork)
library(fgsea)
library(msigdbr)
library(matrixStats)
library(tximeta)
library(SummarizedExperiment)
library(rjson)
library(HDF5Array)
library(DESeq2)
library(survival)
library(survminer)
})
suppressMessages(loadfonts())
theming:
setting ggplot theme
base_size = 10
theme_set(theme_foundation(base_size = base_size,
base_family = "Helvetica") +
theme(plot.title = element_text(size = rel(0.95), face = 'bold'),
panel.background = element_rect(colour = NA),
plot.background = element_rect(colour = NA),
panel.border = element_rect(colour = NA),
axis.line = element_line(),
axis.line.x = NULL,
axis.line.y = NULL,
axis.text = element_text(size = rel(0.8)),
axis.title = element_text(size = rel(0.9)),
axis.text.x = element_text(margin = margin(t = 0.8 * base_size/4), size = rel(0.8)),
axis.text.x.top = element_text(margin = margin(b = 0.8 * base_size/4), vjust = 0),
axis.text.y = element_text(margin = margin(r = 0.5 * base_size/4), hjust = 1, size = rel(0.8)),
axis.text.y.right = element_text(margin = margin(l = 0.5 * base_size/4), hjust = 0),
axis.ticks = element_line(),
axis.ticks.length = unit(base_size/2.5, "pt"), axis.ticks.length.x = NULL,
axis.ticks.length.x.top = NULL, axis.ticks.length.x.bottom = NULL,
axis.ticks.length.y = NULL, axis.ticks.length.y.left = NULL,
axis.ticks.length.y.right = NULL,
strip.text = element_text(size = rel(0.8), face = 'bold'),
strip.background = element_blank(),
legend.key.size= unit(0.08, "in"),
legend.spacing = unit(0, "in"),
legend.key = element_rect(colour = NA),
legend.title = element_text(face="bold", size = rel(0.8)),
legend.text = element_text(size = rel(0.75)),
legend.justification = c("right", "top"),
legend.box.just = "right",
legend.margin = margin(1.016, 1.016, 1.016, 1.016, unit = 'mm'),
plot.margin=margin(2.032,
2.032,
2.032,
2.032,
unit = "mm"),
plot.tag = element_text(size = rel(0.95), face = 'bold'),
plot.tag.position = c(0.01, 0.99),
legend.box.spacing = unit(-0.508, 'mm'),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()
))
theme_set_fun <- function() {
theme_set(theme_foundation(base_size = base_size,
base_family = 'Arial') +
theme(plot.title = element_text(size = rel(0.95), face = 'bold'),
panel.background = element_rect(colour = NA),
plot.background = element_rect(colour = NA),
panel.border = element_rect(colour = NA),
axis.line = element_line(),
axis.line.x = NULL,
axis.line.y = NULL,
axis.text = element_text(size = rel(0.8)),
axis.title = element_text(size = rel(0.9)),
axis.text.x = element_text(margin = margin(t = 0.8 * base_size/4), size = rel(0.8)),
axis.text.x.top = element_text(margin = margin(b = 0.8 * base_size/4), vjust = 0),
axis.text.y = element_text(margin = margin(r = 0.5 * base_size/4), hjust = 1, size = rel(0.8)),
axis.text.y.right = element_text(margin = margin(l = 0.5 * base_size/4), hjust = 0),
axis.ticks = element_line(),
axis.ticks.length = unit(base_size/2.5, "pt"), axis.ticks.length.x = NULL,
axis.ticks.length.x.top = NULL, axis.ticks.length.x.bottom = NULL,
axis.ticks.length.y = NULL, axis.ticks.length.y.left = NULL,
axis.ticks.length.y.right = NULL,
strip.text = element_text(size = rel(0.8), face = 'bold'),
strip.background = element_blank(),
legend.key.size= unit(0.08, "in"),
legend.spacing = unit(0, "in"),
legend.key = element_rect(colour = NA),
legend.title = element_text(face="bold", size = rel(0.8)),
legend.text = element_text(size = rel(0.75)),
legend.justification = c("right", "top"),
legend.box.just = "right",
legend.margin = margin(1.016, 1.016, 1.016, 1.016, unit = 'mm'),
plot.margin=margin(2.032,
2.032,
2.032,
2.032,
unit = "mm"),
plot.tag = element_text(size = rel(0.95), face = 'bold'),
plot.tag.position = c(0.01, 0.99),
legend.box.spacing = unit(-0.508, 'mm'),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()
))
}
txt.mm_to_pts <- function(ratio){ base_size * ratio * (25.4 / 72.27) } # for reverse can use .pt
saving panels
panel.figure.width = 85
panel.figure.width.medium = 114
panel.figure.width.double = 17
panel.figure.height = 279.4
patch_fig_width = unit(172, 'mm')
patch_fig_height = unit(279.4 * 0.9 , 'mm')
gene sets for filtering, identification
if (! file.exists(file.path(output.data.dir, 'meta_data_df.rds'))) {
meta_data <- lst()
meta_data$te.clades <-
read_csv(file.path(output.data.dir, 'reference.data', 'te.clades.csv'))
meta_data$te.families.clades <-
read_csv(file.path(output.data.dir, 'reference.data','te.family.clade.csv'),
col_names = c('gene', 'family', 'clade')) %>%
distinct()
meta_data$trono.krab.znfs <-
read_csv(file.path(output.data.dir, 'krab.znf.trono.list.csv'),
col_names = c('gene', 'biotype', 'ensg', 'chr', 'start', 'end', 'strand', 'ZNF'))
meta_data$gen.24.names <-
read_csv(file.path(output.data.dir, 'reference.data/gen.24.names.csv'),
col_names = c('enst','ensg','tx','gene','len','biotype')) %>%
select(ensg, gene) %>%
distinct()
meta_data$gencode.35.tx.to.gene <-
read_csv(file.path(reference.dir, 'gencode.v35.tx.to.gene.csv'),
col_names = c('tx', 'gene')) %>%
separate(tx, sep = '\\|', into = c('enst','ensg',NA,NA,'tx','gene','len','biotype'))
meta_data$gencode.35.tx.to.gene %>%
dplyr::select(-enst) %>%
group_by(biotype) %>%
tally() -> gen.35.biotype.count
meta_data$gencode.35.lnc.gene.names <-
read_csv(file.path(reference.dir, 'gencode.v35.lncRNA.gene.names.txt'),
col_names = 'gene')
meta_data$ucsc.rmsk.tx.to.gene <-
read_csv(file.path(reference.dir, 'gencode.v35.ucsc.rmsk.tx.to.gene.csv'),
col_names = c('enst', 'gene')) %>%
filter(grepl('^hg38', enst))
ucsc.rmsk.tx.to.gene %>%
mutate(tx = enst) %>%
separate(tx, sep = '_', into = c(NA, NA, 'gene', 'position', NA, NA, 'strand', NA)) %>%
mutate(position = sub('range=', '', position),
strand = sub('strand=', '', strand)) %>%
merge(te.families.clades, by = 'gene') %>%
mutate(tmp.gene = gene,
ensg = NA) %>%
dplyr::select(enst, ensg, 'gene' = tmp.gene, 'len' = clade, 'biotype' = family) -> meta_data$ucsc.rmsk.tx.to.gene.fmt
meta_data$tot.gencode.35.tx.to.gene <-
read_csv(file.path(reference.dir, 'gencode.v35.tx.to.gene.csv'),
col_names = c('tx', 'gene')) %>%
mutate(enst = tx) %>%
separate(tx, sep = '\\|', into = c(NA,'ensg',NA,NA,'tx','gene','len','biotype'))
meta_data$total.tx.to.gene <-
rbind(tot.gencode.35.tx.to.gene %>% dplyr::select(enst, ensg, gene, len, biotype),
ucsc.rmsk.tx.to.gene.fmt %>% dplyr::select(enst = enst, ensg, gene, len, biotype))
meta_data$panc_methylation_df <- read_tsv(file.path(output.data.dir, 'GSE119548_avg_beta.txt.gz'))
saveRDS(meta_data, file.path(output.data.dir, 'meta_data_df.rds'))
} else {
meta.data <- readRDS(file.path(output.data.dir, 'meta_data_df.rds'))
}
processed dataset check:
build_salmon.quant <- FALSE
if (file.exists(file.path(output.data.dir, 'salmon.quant_analysis_set.rds'))) {
salmon.quant <- readRDS(file = file.path(output.data.dir, 'salmon.quant_analysis_set.rds'))
build_salmon.quant <- FALSE
} else {
message('Warning: salmon published dataset not found, will generate equivalent')
build_salmon.quant <- TRUE
}
build_ucsc.rmsk.salmon <- FALSE
if (file.exists(file.path(output.data.dir, 'ucsc.rmsk.salmon.quant_analysis_set.rds'))) {
ucsc.rmsk.salmon.quant <- readRDS(file = file.path(output.data.dir, 'ucsc.rmsk.salmon.quant_analysis_set.rds'))
build_ucsc.rmsk.salmon <- FALSE
} else {
message('Warning: ucsc rmsk salmon published dataset not found, will generate equivalent')
build_ucsc.rmsk.salmon <- TRUE
}
eval_build_datasets <- ifelse(build_ucsc.rmsk.salmon == F &
build_salmon.quant == F, FALSE, TRUE)
feature engineering:
FUN: construct.quant.object
load.tximeta.object <- function(reference){
## RERCOMM
print(file.path(output.data.dir, paste0(reference, '_h5_se')))
txi <- loadHDF5SummarizedExperiment(dir=file.path(output.data.dir, paste0(reference, '_h5_se')))
gxi <- loadHDF5SummarizedExperiment(dir=file.path(output.data.dir, paste0(reference, '_gene_h5_se')))
if (reference == 'process.aware.salmon'){
gxi.split <- loadHDF5SummarizedExperiment(dir=file.path(output.data.dir, paste0(reference, '_gene_split_h5_se')))
return(list('txi' = txi, 'gxi' = gxi, 'gxi.split' = gxi.split))
} else {
return(list('txi' = txi, 'gxi' = gxi))
}
}
subset.tximeta.object <- function(tximeta.list, grep.string){
## RERCOMM
if (grepl('\\|', grep.string)){
subset.txi <- SummarizedExperiment::subset(tximeta.list$txi,
select = !grepl(grep.string, colData(tximeta.list$txi)$names))
subset.gxi <- SummarizedExperiment::subset(tximeta.list$gxi,
select = !grepl(grep.string, colData(tximeta.list$gxi)$names))
} else{
subset.txi <- SummarizedExperiment::subset(tximeta.list$txi,
select = grepl(grep.string, colData(tximeta.list$txi)$names))
subset.gxi <- SummarizedExperiment::subset(tximeta.list$gxi,
select = grepl(grep.string, colData(tximeta.list$gxi)$names))
}
return(list('txi' = subset.txi,
'gxi' = subset.gxi))
}
CALL: construct.quant.object
if (build_salmon.quant == T) {
salmon.quant <- load.tximeta.object('salmon')
salmon.quant$aale$intra <- subset.tximeta.object(salmon.quant, 'intra.aale')
salmon.quant$hbec$hbec_wt <- subset.tximeta.object(salmon.quant, 'wt2_kras')
salmon.quant$hbec$hbec_v12 <- subset.tximeta.object(salmon.quant, '_v12')
salmon.quant$hbec$lacz_v_wt <- subset.tximeta.object(salmon.quant, '_v12|mut2|aale|ZNF|Empty')
salmon.quant$hbec$lacz_v_v12 <- subset.tximeta.object(salmon.quant, '_wt|mut2|aale|ZNF|Empty')
salmon.quant$a549 <- subset.tximeta.object(salmon.quant, 'intra.a549')
salmon.quant$all_intra <- subset.tximeta.object(salmon.quant, 'rnaEASY|hold')
salmon.quant$aale$exo <- subset.tximeta.object(salmon.quant, 'rnaEASY.aale')
salmon.quant$aale$kras_compare <- subset.tximeta.object(salmon.quant, 'aale.*.kras')
salmon.quant$aale$ctrl_compare <- subset.tximeta.object(salmon.quant, 'aale.*.ctrl')
salmon.quant$aale$platform_compare <- subset.tximeta.object(salmon.quant, 'aale')
colData(salmon.quant$gxi) %>%
as.data.frame() %>%
remove_rownames() -> salmon.quant$meta.data.df
rowRanges(salmon.quant$gxi) %>%
as.data.frame() %>%
dplyr::rename('chr' = seqnames) %>%
merge(meta.data$gencode.35.tx.to.gene %>% select(ensg, gene) %>% distinct(), by.x = 'gene_id', by.y = 'ensg') ->
salmon.quant$gxi.bed
rowRanges(salmon.quant$txi) %>%
as.data.frame() %>%
dplyr::rename('chr' = seqnames) %>%
merge(meta.data$gencode.35.tx.to.gene %>% select(enst, gene) %>% distinct(), by.x = 'tx_name', by.y = 'enst') ->
salmon.quant$txi.bed
}
# ------------------------------------------------------------------------------
if (build_ucsc.rmsk.salmon == T) {
ucsc.rmsk.salmon.quant <- load.tximeta.object('ucsc.rmsk.salmon')
ucsc.rmsk.salmon.quant$aale$intra <-
subset.tximeta.object(ucsc.rmsk.salmon.quant, 'intra.aale')
ucsc.rmsk.salmon.quant$hbec$hbec_wt <-
subset.tximeta.object(ucsc.rmsk.salmon.quant, 'wt2_kras')
ucsc.rmsk.salmon.quant$hbec$hbec_v12 <-
subset.tximeta.object(ucsc.rmsk.salmon.quant, '_v12')
ucsc.rmsk.salmon.quant$hbec$lacz_v_wt <-
subset.tximeta.object(ucsc.rmsk.salmon.quant, '_v12|mut2|aale|ZNF|Empty')
ucsc.rmsk.salmon.quant$hbec$lacz_v_v12 <-
subset.tximeta.object(ucsc.rmsk.salmon.quant, '_wt|mut2|aale|ZNF|Empty')
ucsc.rmsk.salmon.quant$a549 <-
subset.tximeta.object(ucsc.rmsk.salmon.quant, 'intra.a549')
ucsc.rmsk.salmon.quant$all_intra <-
subset.tximeta.object(ucsc.rmsk.salmon.quant, 'rnaEASY|hold')
ucsc.rmsk.salmon.quant$aale$exo <-
subset.tximeta.object(ucsc.rmsk.salmon.quant, 'rnaEASY.aale')
ucsc.rmsk.salmon.quant$aale$kras_compare <-
subset.tximeta.object(ucsc.rmsk.salmon.quant, 'aale.*.kras')
ucsc.rmsk.salmon.quant$aale$ctrl_compare <-
subset.tximeta.object(ucsc.rmsk.salmon.quant, 'aale.*.ctrl')
ucsc.rmsk.salmon.quant$aale$platform_compare <-
subset.tximeta.object(ucsc.rmsk.salmon.quant, 'aale')
colData(ucsc.rmsk.salmon.quant$gxi) %>%
as.data.frame() %>%
remove_rownames() -> ucsc.rmsk.salmon.quant$meta.data.df
}
FUN: run.de.seq
run.de.seq <- function(tximeta.object, base_level = 'ctrl'){
tximeta.name <- deparse(substitute(tximeta.object))
print(tximeta.name)
colData(tximeta.object$gxi) %>%
as.data.frame() %>%
remove_rownames() %>%
mutate(condition = relevel(as.factor(condition), base_level)) -> colData.gxi
print(levels(colData.gxi$condition))
print(colData.gxi)
count.matrix.gxi <- assay(tximeta.object$gxi, 'counts') %>%
as.data.frame() %>%
rownames_to_column('gene') %>%
mutate_if(is.numeric, round) %>%
column_to_rownames('gene')
colData(tximeta.object$txi) %>%
as.data.frame() %>%
remove_rownames() -> colData.txi
count.matrix.txi <- assay(tximeta.object$txi, 'counts') %>%
as.data.frame() %>%
rownames_to_column('tmp') %>%
mutate_if(is.numeric, round) %>%
column_to_rownames('tmp')
if (grepl('compare', tximeta.name)) {
colData.gxi$platform <- relevel(as.factor(colData.gxi$platform), 'intra')
dds.gxi <- DESeqDataSetFromMatrix(countData = count.matrix.gxi,
colData = colData.gxi,
design = as.formula(~platform))
dds.txi <- DESeqDataSetFromMatrix(countData = count.matrix.txi,
colData = colData.txi,
design = as.formula(~platform))
} else {
dds.gxi <- DESeqDataSetFromMatrix(countData = count.matrix.gxi,
colData = colData.gxi,
design = as.formula(~condition))
dds.txi <- DESeqDataSetFromMatrix(countData = count.matrix.txi,
colData = colData.txi,
design = as.formula(~condition))
}
dds.txi <- estimateSizeFactors(dds.txi)
dds.txi.norm.counts <- as.data.frame(counts(dds.txi, normalized=T))
dds.txi.vst.counts <- as.data.frame(assay(vst(dds.txi, blind = F)))
dds.gxi <- estimateSizeFactors(dds.gxi)
dds.gxi.norm.counts <- as.data.frame(counts(dds.gxi, normalized=T))
dds.gxi.vst.counts <- as.data.frame(assay(vst(dds.gxi, blind = F)))
if (grepl('all_intra|a549', tximeta.name)){
print('no DE calc')
if (grepl('ucsc.rmsk', tximeta.name)){
print('rmsk')
dds.gxi.norm.counts <- dds.gxi.norm.counts %>%
rownames_to_column('gene')
return(list('counts' = dds.gxi.norm.counts,
'txi.counts' = dds.txi.norm.counts,
'gxi.vst.counts' = dds.gxi.vst.counts,
'col.data' = colData.txi))
quit()
}
dds.gxi.norm.counts <-
dds.gxi.norm.counts %>%
rownames_to_column('gene_id') %>%
merge(salmon.quant$gxi.bed, by = 'gene_id') %>%
distinct()
dds.txi.norm.counts <-
dds.gxi.norm.counts %>%
rownames_to_column('tx_name') %>%
merge(salmon.quant$txi.bed, by = 'tx_name') %>%
distinct()
return(list('counts' = dds.gxi.norm.counts,
'txi.counts' = dds.txi.norm.counts,
'gxi.vst.counts' = dds.gxi.vst.counts,
'col.data' = colData.txi))
quit()
}
filter <- rowSums(counts(dds.gxi, normalized=T) >= 10) >= 2
dds.gxi <- DESeq(dds.gxi[filter,])
print(resultsNames(dds.gxi))
coef = print(tail(resultsNames(dds.gxi), n=1))
if (grepl('ucsc.rmsk', tximeta.name)){
print('rmsk')
dds.gxi.lfc <- lfcShrink(dds.gxi, coef = coef) %>%
as.data.frame() %>%
# filter(padj <= 0.05) %>%
rownames_to_column('gene') %>%
dplyr::select(gene, baseMean, log2FoldChange, padj)
dds.gxi.norm.counts <- dds.gxi.norm.counts %>%
rownames_to_column('gene')
} else{
dds.gxi.lfc <- lfcShrink(dds.gxi, coef = coef) %>%
as.data.frame() %>%
# filter(padj <= 0.05) %>%
rownames_to_column('gene_id') %>%
merge(salmon.quant$gxi.bed, by = 'gene_id')
plotMA(lfcShrink(dds.gxi, coef = coef))
dds.gxi.norm.counts <-
dds.gxi.norm.counts %>%
rownames_to_column('gene_id') %>%
merge(salmon.quant$gxi.bed, by = 'gene_id') %>%
distinct()
dds.txi.norm.counts <-
dds.gxi.norm.counts %>%
rownames_to_column('tx_name') %>%
merge(salmon.quant$txi.bed, by = 'tx_name') %>%
distinct()
}
return(list('de' = dds.gxi.lfc,
'counts' = dds.gxi.norm.counts,
'txi.counts' = dds.txi.norm.counts,
'txi.vst.counts' = dds.txi.vst.counts,
'gxi.vst.counts' = dds.gxi.vst.counts,
'col.data' = colData.txi))
}
run.de.seq.A549.contrast <- function(tximeta.object, base_level = 'Empty'){
tximeta.name <- deparse(substitute(tximeta.object))
print(tximeta.name)
colData(tximeta.object$gxi) %>%
as.data.frame() %>%
remove_rownames() %>%
mutate(condition = relevel(as.factor(condition), base_level)) -> colData.gxi
print(levels(colData.gxi$condition))
print(colData.gxi)
count.matrix.gxi <- assay(tximeta.object$gxi, 'counts') %>%
as.data.frame() %>%
rownames_to_column('gene') %>%
mutate_if(is.numeric, round) %>%
column_to_rownames('gene')
colData(tximeta.object$txi) %>%
as.data.frame() %>%
remove_rownames() -> colData.txi
count.matrix.txi <- assay(tximeta.object$txi, 'counts') %>%
as.data.frame() %>%
rownames_to_column('tmp') %>%
mutate_if(is.numeric, round) %>%
column_to_rownames('tmp')
# colData.gxi$platform <- relevel(as.factor(colData.gxi$platform), 'intra')
dds.gxi <- DESeqDataSetFromMatrix(countData = count.matrix.gxi,
colData = colData.gxi,
design = as.formula(~condition))
# dds.txi <- DESeqDataSetFromMatrix(countData = count.matrix.txi,
# colData = colData.txi,
# design = as.formula(~platform))
dds.gxi <- estimateSizeFactors(dds.gxi)
filter <- rowSums(counts(dds.gxi, normalized=T) >= 10) >= 2
dds.gxi <- DESeq(dds.gxi[filter,])
print(resultsNames(dds.gxi))
# results(dds.gxi, contrast =list("conditionkras.platformrnaEASY"))
#
dds.682.gxi.lfc <- lfcShrink(dds.gxi, coef = "condition_ZNF682_vs_Empty") %>%
as.data.frame() %>%
# filter(padj <= 0.05) %>%
rownames_to_column('gene_id') #%>%
# merge(salmon.quant$gxi.bed, by = 'gene_id')
dds.257.gxi.lfc <- lfcShrink(dds.gxi, coef = "condition_ZNF257_vs_Empty") %>%
as.data.frame() %>%
# filter(padj <= 0.05) %>%
rownames_to_column('gene_id') #%>%
# merge(salmon.quant$gxi.bed, by = 'gene_id')
return(list('de_682' = dds.682.gxi.lfc,
'de_257' = dds.257.gxi.lfc,
'col.data' = colData.txi))
}
run.de.seq.contrast <- function(tximeta.object, base_level = 'ctrl'){
tximeta.name <- deparse(substitute(tximeta.object))
print(tximeta.name)
colData(tximeta.object$gxi) %>%
as.data.frame() %>%
remove_rownames() %>%
mutate(condition = relevel(as.factor(condition), base_level)) -> colData.gxi
print(levels(colData.gxi$condition))
print(colData.gxi)
count.matrix.gxi <- assay(tximeta.object$gxi, 'counts') %>%
as.data.frame() %>%
rownames_to_column('gene') %>%
mutate_if(is.numeric, round) %>%
column_to_rownames('gene')
colData(tximeta.object$txi) %>%
as.data.frame() %>%
remove_rownames() -> colData.txi
count.matrix.txi <- assay(tximeta.object$txi, 'counts') %>%
as.data.frame() %>%
rownames_to_column('tmp') %>%
mutate_if(is.numeric, round) %>%
column_to_rownames('tmp')
# colData.gxi$platform <- relevel(as.factor(colData.gxi$platform), 'intra')
# dds.gxi <- DESeqDataSetFromMatrix(countData = count.matrix.gxi,
# colData = colData.gxi,
# design = as.formula(~condition +
# platform +
# platform:condition))
dds.txi <- DESeqDataSetFromMatrix(countData = count.matrix.txi,
colData = colData.txi,
design = as.formula(~platform))
dds.gxi <- estimateSizeFactors(dds.gxi)
filter <- rowSums(counts(dds.gxi, normalized=T) >= 10) >= 2
dds.gxi <- DESeq(dds.gxi[filter,])
print(resultsNames(dds.gxi))
# results(dds.gxi, contrast =list("conditionkras.platformrnaEASY"))
#
results(dds.gxi, contrast =list("conditionkras.platformrnaEASY"))
ctrl.dds.gxi.lfc <- lfcShrink(dds.gxi, coef = "conditionkras.platformrnaEASY") %>%
as.data.frame() %>%
# filter(padj <= 0.05) %>%
rownames_to_column('gene_id') %>%
merge(salmon.quant$gxi.bed, by = 'gene_id')
ctrl.dds.gxi.lfc <- lfcShrink(dds.gxi, coef = "conditionctrl.platformrnaEASY") %>%
as.data.frame() %>%
# filter(padj <= 0.05) %>%
rownames_to_column('gene_id') %>%
merge(salmon.quant$gxi.bed, by = 'gene_id')
return(list('ctrl' = ctrl.dds.gxi.lfc,
'kras' = ctrl.dds.gxi.lfc,
'col.data' = colData.txi))
}
CALL: run.de.seq
if (build_salmon.quant == T) {
salmon.quant$aale$intra$deseq <- run.de.seq(salmon.quant$aale$intra, base_level = 'ctrl')
salmon.quant$aale$exo$deseq <- run.de.seq(salmon.quant$aale$exo, base_level = 'ctrl')
salmon.quant$aale$kras_compare$deseq <- run.de.seq(salmon.quant$aale$kras_compare, base_level = 'kras')
salmon.quant$aale$ctrl_compare$deseq <- run.de.seq(salmon.quant$aale$ctrl_compare, base_level = 'ctrl')
salmon.quant$aale$platform_compare$deseq <- run.de.seq(salmon.quant$aale$platform_compare, base_level = 'ctrl')
salmon.quant$hbec$lacz_v_v12$deseq <- run.de.seq(salmon.quant$hbec$lacz_v_v12, base_level = 'wt2_lacz')
salmon.quant$a549$deseq <- run.de.seq(salmon.quant$a549, base_level = 'Empty')
salmon.quant$all_intra$deseq <- run.de.seq(salmon.quant$all_intra)
}
# ------------------------------------------------------------------------------
if (build_ucsc.rmsk.salmon == T) {
ucsc.rmsk.salmon.quant$aale$intra$deseq <- run.de.seq(ucsc.rmsk.salmon.quant$aale$intra, base_level = 'ctrl')
ucsc.rmsk.salmon.quant$aale$exo$deseq <- run.de.seq(ucsc.rmsk.salmon.quant$aale$exo, base_level = 'ctrl')
ucsc.rmsk.salmon.quant$aale$kras_compare$deseq <-
run.de.seq(ucsc.rmsk.salmon.quant$aale$kras_compare, base_level = 'kras')
ucsc.rmsk.salmon.quant$aale$ctrl_compare$deseq <-
run.de.seq(ucsc.rmsk.salmon.quant$aale$ctrl_compare, base_level = 'ctrl')
ucsc.rmsk.salmon.quant$aale$platform_compare$deseq <-
run.de.seq(ucsc.rmsk.salmon.quant$aale$platform_compare, base_level = 'ctrl')
ucsc.rmsk.salmon.quant$hbec$lacz_v_v12$deseq <- run.de.seq(ucsc.rmsk.salmon.quant$hbec$lacz_v_v12, base_level = 'wt2_lacz')
ucsc.rmsk.salmon.quant$hbec$a549$deseq <- run.de.seq(ucsc.rmsk.salmon.quant$a549, base_case = 'Empty')
ucsc.rmsk.salmon.quant$all_intra$deseq <- run.de.seq(ucsc.rmsk.salmon.quant$all_intra)
saveRDS(ucsc.rmsk.salmon.quant, file = file.path(output.data.dir, 'ucsc.rmsk.salmon.quant_analysis_set.rds'))
}
CALL: a549 de seq
a549_de <- run.de.seq.A549.contrast(salmon.quant$a549, base_level = 'Empty')
a549_te_de <- run.de.seq.A549.contrast(ucsc.rmsk.salmon.quant$a549, base_level = 'Empty')
FUN: plot_pca
plot_pca <- function(counts, col.data) {
set.seed(234)
### Gencode PCA
counts %>% apply(., 1, sd) -> pca_sd
counts[!pca_sd == 0, ] -> pca_filt
col.data -> pca_col_data
pca_filt %>%
t() %>%
as.data.frame() %>%
prcomp(scale=T) -> pca_out_raw
pcs.of.interest <- apply(pca_out_raw$x, 2, var)
pcs.props <- pcs.of.interest/sum(pcs.of.interest)
# convert output to dataframe
pca.out <- as.data.frame(pca_out_raw$x) %>%
rownames_to_column('names') %>%
merge(pca_col_data, by ='names')
# summarize PC contributions
pca.out.summary <-
as.data.frame(summary(pca_out_raw)$importance) %>%
rownames_to_column('metric') %>%
gather(pc, value, -metric) %>%
spread(metric, value)
# summarize PC contributions
pca.out.contributions <-
as.data.frame(pca_out_raw$rotation) %>%
rownames_to_column('tx') %>%
gather(pc, contrib, -tx) %>%
#merge(gencode.reference,
# by.x = 'tx',
# by.y = 'gene.name') %>%
group_by(pc) %>%
mutate(contrib = abs(contrib)) %>%
filter(contrib > quantile(contrib, 0.95))
return(list('pca.out' = pca.out,
'pcs.props' = pcs.props))
}
CALL: plot_pca
if (build_salmon.quant == T) {
salmon.quant$aale$intra$deseq$pca <- plot_pca(salmon.quant$aale$intra$deseq$gxi.vst.counts,
salmon.quant$aale$intra$deseq$col.data)
salmon.quant$aale$exo$deseq$pca <- plot_pca(salmon.quant$aale$exo$deseq$gxi.vst.counts,
salmon.quant$aale$exo$deseq$col.data)
}
FUN: make.fgsea.ranks
make.fgsea.ranks <- function(de.seq){
# build a ranked gene list in the appropriate format (named list)
# using shrunken log2 Fold Change values from DESeqII
de.seq <-
de.seq %>%
mutate(rank = as.double(log2FoldChange),
Gene = as.character(gene)) %>%
dplyr::select(Gene, rank) %>%
mutate(rank = scale(rank))
de.seq.rnk <-
de.seq$rank
de.seq.rnk <-
setNames(de.seq$rank, de.seq$Gene)
# run the GSEA implementation on the hallmark sets supplemented with custom sets
print('fgsea')
de.seq.fgsea.res <-
fgsea(pathways = msig.df,
stats = de.seq.rnk, eps = 0.0)
print('padj filter and clean names')
# convert to a tidy format, clean the names for display
de.seq.fgsea.df <-
as_tibble(de.seq.fgsea.res) %>%
filter(padj < 0.05) %>%
mutate(pathway = gsub('_', ' ', pathway))
return(de.seq.fgsea.df)
}
CALL: make.fgsea.ranks
msig.df <-
msigdbr(species = 'Homo sapiens', category = 'H') %>%
dplyr::select(gene_symbol, gs_name) %>%
split(x = .$gene_symbol, f = .$gs_name)
msig.df$HALLMARK_KRAS_G12D_ZNF_DN <- salmon.quant$aale$intra$deseq$de %>%
filter(gene %in% filter(meta.data$trono.krab.znfs, ZNF == 'KRAB-ZNFs')$gene,
log2FoldChange <= -1) %>%
dplyr::select(gene) %>% unlist() %>% unname()
if (build_salmon.quant == T) {
salmon.quant$aale$intra$deseq$H.fgsea <- make.fgsea.ranks(salmon.quant$aale$intra$deseq$de)
salmon.quant$aale$exo$deseq$H.fgsea <- make.fgsea.ranks(salmon.quant$aale$exo$deseq$de)
salmon.quant$aale$kras_compare$deseq$H.fgsea <- make.fgsea.ranks(salmon.quant$aale$kras_compare$deseq$de)
salmon.quant$aale$ctrl_compare$deseq$H.fgsea <- make.fgsea.ranks(salmon.quant$aale$ctrl_compare$deseq$de)
salmon.quant$aale$platform_compare$deseq$H.fgsea <- make.fgsea.ranks(salmon.quant$aale$platform_compare$deseq$de)
salmon.quant$hbec$hbec_wt$deseq$H.fgsea <- make.fgsea.ranks(salmon.quant$hbec$hbec_wt$deseq$de)
salmon.quant$hbec$hbec_v12$deseq$H.fgsea <- make.fgsea.ranks(salmon.quant$hbec$hbec_v12$deseq$de)
salmon.quant$hbec$lacz_v_wt$deseq$H.fgsea <-make.fgsea.ranks(salmon.quant$hbec$lacz_v_wt$deseq$de)
salmon.quant$hbec$lacz_v_v12$deseq$H.fgsea <- make.fgsea.ranks(salmon.quant$hbec$lacz_v_v12$deseq$de)
msig.df <-
msigdbr(species = 'Homo sapiens') %>%
dplyr::select(gene_symbol, gs_name) %>%
filter(!grepl('HALLMARK', gs_name)) %>%
split(x = .$gene_symbol, f = .$gs_name)
salmon.quant$aale$intra$deseq$A.fgsea <- make.fgsea.ranks(salmon.quant$aale$intra$deseq$de)
salmon.quant$aale$exo$deseq$A.fgsea <- make.fgsea.ranks(salmon.quant$aale$exo$deseq$de)
salmon.quant$aale$kras_compare$deseq$A.fgsea <- make.fgsea.ranks(salmon.quant$aale$kras_compare$deseq$de)
salmon.quant$aale$ctrl_compare$deseq$A.fgsea <- make.fgsea.ranks(salmon.quant$aale$ctrl_compare$deseq$de)
salmon.quant$aale$platform_compare$deseq$A.fgsea <- make.fgsea.ranks(salmon.quant$aale$platform_compare$deseq$de)
salmon.quant$hbec$hbec_wt$deseq$A.fgsea <- make.fgsea.ranks(salmon.quant$hbec$hbec_wt$deseq$de)
salmon.quant$hbec$hbec_v12$deseq$A.fgsea <- make.fgsea.ranks(salmon.quant$hbec$hbec_v12$deseq$de)
salmon.quant$hbec$lacz_v_wt$deseq$A.fgsea <- make.fgsea.ranks(salmon.quant$hbec$lacz_v_wt$deseq$de)
salmon.quant$hbec$lacz_v_v12$deseq$A.fgsea <- make.fgsea.ranks(salmon.quant$hbec$lacz_v_v12$deseq$de)
saveRDS(salmon.quant, file = file.path(output.data.dir, 'salmon.quant_analysis_set.rds'))
}
generate FIMO output
library(GenomicRanges)
library(BSgenome.Hsapiens.UCSC.hg38)
library(BSgenome)
ifn_g <- msig.df$HALLMARK_INTERFERON_ALPHA_RESPONSE
ifn_a <- msig.df$HALLMARK_INTERFERON_GAMMA_RESPONSE
jak_stat <- msig.df$HALLMARK_IL6_JAK_STAT3_SIGNALING
options(meme_db = file.path(output.data.dir, 'reference.data',
'HOCOMOCOv11_core_HUMAN_mono_meme_format.meme'))
salmon.quant$aale$intra$deseq$de %>%
filter(padj < 0.05, log2FoldChange > 1.5, gene %in% c(ifn_g, ifn_a, jak_stat)) %>%
# filter(gene %in% filter(salmon.quant$hbec$lacz_v_v12$deseq$de, padj < 0.05, log2FoldChange > 0.5)$gene) %>%
select(gene_id) %>% unlist() %>% unname() -> de_list
salmon.quant$aale$intra$deseq$de %>%
filter(padj > 0.05, gene %in% c(ifn_g, ifn_a, jak_stat)) %>%
# filter(gene %in% filter(salmon.quant$hbec$lacz_v_v12$deseq$de, padj > 0.05, log2FoldChange > 0)$gene) %>%
select(gene_id) %>% unlist() %>% unname() -> bg_de_list
salmon.quant$aale$intra$deseq$de %>%
filter(padj < 0.05, log2FoldChange > 0, gene %in% c(ifn_g, ifn_a), !gene_id %in% c(de_list)) %>%
filter(gene %in% filter(salmon.quant$hbec$lacz_v_v12$deseq$de, padj < 0.05, log2FoldChange < 0)$gene) %>%
select(gene_id) %>% unlist() %>% unname() -> de_list_w_hbec
# salmon.quant$hbec$lacz_v_v12$deseq$de %>%
# filter(padj < 0.05, log2FoldChange > 0) %>% #, gene %in% c(ifn_g, ifn_a)) %>%
# filter(!gene %in% filter(salmon.quant$aale$intra$deseq$de, padj < 0.05, log2FoldChange > 0)$gene) %>%
# select(gene_id) %>% unlist() %>% unname() -> de_list_w_hbec
gr <- rowRanges(salmon.quant$gxi)
ifn_de_gr <- gr[names(gr) %in% de_list]
ng_de_gr <- gr[names(gr) %in% bg_de_list]
promoter_seqs <-
GenomicFeatures::getPromoterSeq(subject = BSgenome.Hsapiens.UCSC.hg38,
query = ifn_de_gr, upstream = 500, downstream = 500)
promoter_seqs_hbec <-
GenomicFeatures::getPromoterSeq(subject = BSgenome.Hsapiens.UCSC.hg38,
query = ng_de_gr, upstream = 500, downstream = 500)
# dreme_results <- memes::runMeme(input = promoter_seqs, control = NA)
#
tomTom_res <- memes::runAme(control = promoter_seqs_hbec,
input = promoter_seqs)
tomTom_res %>%
mutate(tmp = motif_id) %>%
separate(tmp, sep = '_', into = c('TF', NA)) %>%
mutate(TF = str_replace(TF, pattern = '^ZN', replacement = 'ZNF'),
TF = str_replace(TF, pattern = 'ZNFF', replacement = 'ZNF'),
TF = str_replace(TF, pattern = '^NFAC', replacement = 'NFATC')) -> ame_tf
isre_motif <- MotifDb::query(MotifDb::MotifDb, "ISRE") %>%
universalmotif::convert_motifs()
isre_motif <- isre_motif[[1]]
irf9_motif <- MotifDb::query(MotifDb::MotifDb, "IRF9_HUMAN.H11MO.0.C") %>%
universalmotif::convert_motifs()
irf7_motif <- MotifDb::query(MotifDb::MotifDb, "IRF7_HUMAN.H11MO.0.C") %>%
universalmotif::convert_motifs()
irf1_motif <- MotifDb::query(MotifDb::MotifDb, "IRF1_HUMAN.H11MO.0.A") %>%
universalmotif::convert_motifs()
fos_motif <- MotifDb::query(MotifDb::MotifDb, "FOS") %>%
universalmotif::convert_motifs()
foxp2_motif <- MotifDb::query(MotifDb::MotifDb, 'FOXP2_HUMAN.H11MO.0.C') %>%
universalmotif::convert_motifs()
irf1_fimo_results <- memes::runFimo(promoter_seqs, irf1_motif, thresh = 1e-4) %>%
as.data.frame() %>%
merge(salmon.quant$aale$intra$deseq$de, by.x = 'seqnames', by.y = 'gene_id')
# select(gene) %>% distinct()
irf7_fimo_results <- memes::runFimo(promoter_seqs, irf7_motif, thresh = 1e-4) %>%
as.data.frame() %>%
merge(salmon.quant$aale$intra$deseq$de, by.x = 'seqnames', by.y = 'gene_id')
irf9_fimo_results <- memes::runFimo(promoter_seqs, irf9_motif, thresh = 1e-4) %>%
as.data.frame() %>%
merge(salmon.quant$aale$intra$deseq$de, by.x = 'seqnames', by.y = 'gene_id')
fos_fimo_results <- memes::runFimo(promoter_seqs, fos_motif, thresh = 1e-6) %>%
as.data.frame() %>%
merge(salmon.quant$aale$intra$deseq$de, by.x = 'seqnames', by.y = 'gene_id')
# select(gene) %>% distinct()
foxp2_fimo_results <- memes::runFimo(promoter_seqs, foxp2_motif, thresh = 1e-3) %>%
as.data.frame() %>%
merge(salmon.quant$aale$intra$deseq$de, by.x = 'seqnames', by.y = 'gene_id')
# select(gene) %>% distinct()
irf9_fimo_results <- memes::runFimo(promoter_seqs, irf9_motif, thresh = 1e-3) %>%
as.data.frame() %>%
merge(salmon.quant$aale$intra$deseq$de, by.x = 'seqnames', by.y = 'gene_id')
# select(gene) %>% distinct()
ATAC
datasets
kras_atac_anno_peaks <-
read_tsv(file.path(output.data.dir,
'narrow_output.data/bwa/mergedLibrary/macs/narrowPeak/kras_R1.mLb.clN_peaks.annotatePeaks.txt'))
ctrl_atac_anno_peaks <-
read_tsv(file.path(output.data.dir,
'narrow_output.data/bwa/mergedLibrary/macs/narrowPeak/ctrl_R1.mLb.clN_peaks.annotatePeaks.txt'))
consensus_atac_anno_peak_counts <-
read_tsv(file.path(output.data.dir,
'narrow_output.data/bwa/mergedLibrary/macs/narrowPeak/consensus/consensus_peaks.mLb.clN.featureCounts.txt'),
skip = 1)
consensus_atac_boolean_anno <-
read_tsv(file.path(output.data.dir,
'narrow_output.data/bwa/mergedLibrary/macs/narrowPeak/consensus/consensus_peaks.mLb.clN.boolean.annotatePeaks.txt'
))
parsed_datasets & GREAT
irf9_fimo_results %>%
mutate(start.x = start.y + start.x,
end.x = start.y + end.x) %>%
select(chr, start = start.x, end = end.x, strand = strand.x, gene, motif_id) %>%
unite(col = 'name', sep = '_', c(gene, motif_id)) %>%
makeGRangesFromDataFrame(keep.extra.columns = T) -> irf9_bed
consensus_atac_anno_peak_counts %>%
select(chr = Chr, start = Start, end = End, strand = Strand, name = Geneid) %>%
makeGRangesFromDataFrame(keep.extra.columns = T) -> consensus_bed
consensus_atac_boolean_anno %>%
filter(ctrl_R1.mLb.clN.bool == F) %>%
select(chr, start, end, name = interval_id) %>%
makeGRangesFromDataFrame(keep.extra.columns = T) -> kras_atac_bed
consensus_atac_boolean_anno %>%
filter(ctrl_R1.mLb.clN.bool == F) %>%
separate(Annotation, sep = '\\(', into = c('annotation', NA)) %>%
filter(annotation == "promoter-TSS ") %>%
group_by(`Gene Name`) %>%
tally() -> uniq_kras_peak_genes
consensus_atac_boolean_anno %>%
filter(ctrl_R1.mLb.clN.bool == F) %>%
filter(`Gene Name` %in% filter(salmon.quant$aale$intra$deseq$de, padj < 0.05, log2FoldChange > 1.5)$gene) %>%
filter(`Gene Name` %in% c(ifn_g, ifn_a)) %>%
separate(Annotation, sep = '\\(', into = c('annotation', NA)) %>%
filter(annotation == "promoter-TSS ") %>%
group_by(`Gene Name`) %>%
tally() -> uniq_ifn_kras_peak_genes
consensus_atac_boolean_anno %>%
filter(kras_R1.mLb.clN.bool == F) %>%
separate(Annotation, sep = '\\(', into = c('annotation', NA)) %>%
filter(annotation == "promoter-TSS ") %>%
group_by(`Gene Name`) %>%
tally() -> uniq_ctrl_peak_genes
consensus_atac_boolean_anno %>%
separate(Annotation, sep = '\\(', into = c('annotation', NA)) %>%
select(chr, start, end, gene = `Gene Name`) %>%
mutate(strand = '+') -> bg_peak_input_bed
consensus_atac_boolean_anno %>%
filter(ctrl_R1.mLb.clN.bool == F) %>%
filter(`Gene Name` %in% filter(salmon.quant$aale$intra$deseq$de, padj < 0.05, log2FoldChange > 1.5)$gene) %>%
separate(Annotation, sep = '\\(', into = c('annotation', NA)) %>%
select(chr, start, end, gene = `Gene Name`) %>%
mutate(strand = '+') -> uniq_kras_peak_input_bed
uniq_kras_great_job <- rGREAT::submitGreatJob(gr = uniq_kras_peak_input_bed, species = 'hg38', bg = bg_peak_input_bed)
uniq_kras_great_tb <- rGREAT::getEnrichmentTables(uniq_kras_great_job, download_by = 'tsv')
consensus_atac_boolean_anno %>%
filter(kras_R1.mLb.clN.bool == F) %>%
filter(`Gene Name` %in% filter(salmon.quant$aale$intra$deseq$de, padj < 0.05, log2FoldChange < -1.5)$gene) %>%
separate(Annotation, sep = '\\(', into = c('annotation', NA)) %>%
select(chr, start, end, gene = `Gene Name`) %>%
mutate(strand = '+') -> uniq_ctrl_peak_input_bed
uniq_ctrl_job <- rGREAT::submitGreatJob(uniq_ctrl_peak_input_bed, species = 'hg38', bg = bg_peak_input_bed)
uniq_ctrl_tb <- rGREAT::getEnrichmentTables(uniq_ctrl_job, download_by = 'tsv')
peak ranges
consensus_atac_boolean_anno %>%
filter(ctrl_R1.mLb.clN.bool == F) %>%
filter(`Gene Name` %in% filter(salmon.quant$aale$intra$deseq$de, padj < 0.05, log2FoldChange > 1.5)$gene) %>%
filter(`Gene Name` %in% c(ifn_g, ifn_a)) %>%
separate(Annotation, sep = '\\(', into = c('annotation', NA)) %>%
filter(annotation == "promoter-TSS ") %>%
mutate(strand = '+') %>%
select(chr, start, end, strand, name = `Gene Name`) %>%
GenomicRanges::makeGRangesFromDataFrame() -> ifn_uniq_peak_ranges
names(ifn_uniq_peak_ranges) <- uniq_ifn_kras_peak_genes$`Gene Name`
consensus_atac_boolean_anno %>%
filter(ctrl_R1.mLb.clN.bool == T) %>%
filter(`Gene Name` %in% filter(salmon.quant$aale$intra$deseq$de, padj < 0.05, log2FoldChange > 0.5)$gene) %>%
filter(`Gene Name` %in% c(ifn_g, ifn_a), ! is.na(`Gene Name`), ! `Gene Name` %in% names(ifn_uniq_peak_ranges)) %>%
separate(Annotation, sep = '\\(', into = c('annotation', NA)) %>%
filter(annotation == "promoter-TSS ") %>%
mutate(strand = '+') %>%
select(chr, start, end, strand, name = `Gene Name`) %>%
GenomicRanges::makeGRangesFromDataFrame() -> other_uniq_peak_ranges
names(other_uniq_peak_ranges) <- consensus_atac_boolean_anno %>%
filter(ctrl_R1.mLb.clN.bool == T) %>%
filter(`Gene Name` %in% filter(salmon.quant$aale$intra$deseq$de, padj < 0.05, log2FoldChange > 0.5)$gene) %>%
filter(`Gene Name` %in% c(ifn_g, ifn_a), ! is.na(`Gene Name`), ! `Gene Name` %in% names(ifn_uniq_peak_ranges)) %>%
separate(Annotation, sep = '\\(', into = c('annotation', NA)) %>%
filter(annotation == "promoter-TSS ") %>%
select(`Gene Name`) %>% unlist() %>% unname()
TE ranges
build_ranges_lists = TRUE
if (file.exists(file.path(output.data.dir, 'intra.aale.te_ranges_list.rds')) &
file.exists(file.path(output.data.dir, 'exo.aale.te_ranges_list.rds')) &
file.exists(file.path(output.data.dir, 'hbec.aale.te_ranges_list.rds'))) {
build_ranges_lists = FALSE
intra.aale.te_ranges_list <-
readRDS(file.path(output.data.dir, 'intra.aale.te_ranges_list.rds'))
exo.aale.te_ranges_list <-
readRDS(file.path(output.data.dir, 'exo.aale.te_ranges_list.rds'))
hbec.aale.te_ranges_list <-
readRDS(file.path(output.data.dir, 'hbec.aale.te_ranges_list.rds'))
}
TE filters
if (file.exists(file.path(output.data.dir, 'rmsk_parsed_bed.rds'))) {
rmsk_parsed_bed <- readRDS(file.path(output.data.dir, 'rmsk_parsed_bed.rds'))
} else {
meta.data$ucsc.rmsk.tx.to.gene.fmt %>%
select(enst, gene, biotype) %>%
separate(enst, sep = '_', into = c(NA, NA, 'gene', 'position', NA, NA, 'strand', NA)) %>%
mutate(position = sub('range=', '', position),
strand = sub('strand=', '', strand)) -> rmsk_bed
rmsk_bed %>%
filter(grepl(':', position)) %>%
separate(position, sep = ':', into = c('chr', 'range')) %>%
separate(range, sep = '-', into = c('start', 'end')) -> rmsk_parsed_bed
saveRDS(rmsk_parsed_bed, file.path(output.data.dir, 'rmsk_parsed_bed.rds'))
}
imap(list('aale_intra' = ucsc.rmsk.salmon.quant$aale$intra$deseq$de,
'hbec_intra' = ucsc.rmsk.salmon.quant$hbec$lacz_v_v12$deseq$de,
'aale_exo' = ucsc.rmsk.salmon.quant$aale$exo$deseq$de), function(x, x_name) {
# ), function(x, x_name) {
x %>%
filter(!gene %in% meta.data$gencode.35.tx.to.gene$gene,
padj < 0.05, log2FoldChange > 0.65, !grepl('\\)n', gene)) %>%
mutate(context = x_name)
# }) %>% bind_rows() -> upreg_te
}) -> upreg_te
imap(list('aale_intra' = ucsc.rmsk.salmon.quant$aale$intra$deseq$de,
'h358_intra' = ucsc.rmsk.salmon.quant$hbec$lacz_v_v12$deseq$de,
'aale_exo' = ucsc.rmsk.salmon.quant$aale$exo$deseq$de), function(x, x_name) {
x %>%
filter(!gene %in% meta.data$gencode.35.tx.to.gene$gene,
padj < 0.05, log2FoldChange < -1, !grepl('\\)n', gene)) %>%
mutate(context = x_name)
}) %>% bind_rows() -> dnreg_te
intra.rmsk.txi <- as.data.frame(assay(ucsc.rmsk.salmon.quant$aale$intra$txi, 'counts'))
exo.rmsk.txi <- as.data.frame(assay(ucsc.rmsk.salmon.quant$aale$exo$txi, 'counts'))
hbec.rmsk.txi <- as.data.frame(assay(ucsc.rmsk.salmon.quant$hbec$lacz_v_v12$txi, 'counts'))
te_filter <- function(txi_df) {
rmsk.txi <- txi_df[!grepl('^ENST', rownames(txi_df)), ]
rmsk.txi %>%
rownames_to_column('enst') %>%
select(enst, contains('kras')) %>%
filter_if(is.numeric, any_vars(. > 2)) %>%
separate(enst, sep = '_', into = c(NA, NA, 'gene', 'position', NA, NA, 'strand', NA)) %>%
mutate(position = sub('range=', '', position),
strand = sub('strand=', '', strand)) %>%
separate(position, sep = ':', into = c('chr', 'tmp')) %>%
separate(tmp, sep = '-', into = c('start', 'end')) -> rmsk.txi.bed
rmsk.txi.bed %>%
select(-strand) %>%
merge(rmsk_parsed_bed, by = c('gene', 'chr', 'start', 'end')) -> filter_rmsk_parsed_bed
filter_rmsk_parsed_bed
}
intra.rmsk.txi_filter_parsed <- te_filter(intra.rmsk.txi)
exo.rmsk.txi_filter_parsed <- te_filter(exo.rmsk.txi)
hbec.rmsk.txi_filter_parsed <- te_filter(hbec.rmsk.txi)
Ranges Lists
if(build_ranges_lists == TRUE) {
tf_list_generator <- function(upreg_te_list,
filter_te_list) {
lapply(unique(upreg_te_list$gene), function(te) {
rmsk_parsed_bed %>%
filter(gene %in% upreg_te_list$gene,
gene == te) %>%
GenomicRanges::makeGRangesFromDataFrame() -> te_ranges
names(te_ranges) <- rep_len(te, length(te_ranges))
te_ranges
}) -> upreg_te_ranges_list
lapply(unique(filter_te_list$gene), function(te) {
filter_te_list %>%
filter(gene %in% upreg_te_list$gene,
gene == te) %>%
GenomicRanges::makeGRangesFromDataFrame() -> te_ranges
names(te_ranges) <- rep_len(te, length(te_ranges))
te_ranges
}) -> filtered_te_ranges_list
lapply(unique(filter_te_list$biotype), function(bt) {
rmsk_parsed_bed %>%
filter(gene %in% upreg_te_list$gene,
biotype == bt) %>%
GenomicRanges::makeGRangesFromDataFrame() -> te_ranges
names(te_ranges) <- rep_len(bt, length(te_ranges))
te_ranges
}) -> filtered_bt_ranges_list
return(list('upreg' = upreg_te_ranges_list,
'filter_gene' = filtered_te_ranges_list,
'filter_biotype' = filtered_bt_ranges_list))
}
intra.aale.te_ranges_list <- tf_list_generator(upreg_te$aale_intra, intra.rmsk.txi_filter_parsed)
exo.aale.te_ranges_list <- tf_list_generator(upreg_te$aale_exo, exo.rmsk.txi_filter_parsed)
hbec.aale.te_ranges_list <- tf_list_generator(upreg_te$hbec_intra, hbec.rmsk.txi_filter_parsed)
plot_te_bamSignal <- function(pos) {
width(te_ranges_list[[pos]])
median(width(te_ranges_list[[pos]]))
te_ranges_list[[pos]][width(te_ranges_list[[pos]]) > 10, ] -> te_ranges_tmp
resize(te_ranges_tmp, width = median(width(te_ranges_list[[pos]]))) -> te_ranges_tmp
bamsignals::bamCoverage(kras_bam_path,
te_ranges_tmp, paired.end = 'extend') -> kras_de_te_profile
bamsignals::bamCoverage(ctrl_bam_path,
te_ranges_tmp, paired.end = 'extend') -> ctrl_de_te_profile
bamsignals::alignSignals(kras_de_te_profile) %>%
as.data.frame() %>%
mutate(position = row_number()) %>%
gather(prom, cov, -position) %>%
mutate(cov = (cov/kras_norm_factor) * 1e6,
condition = 'kras',
context = 'de') -> kras_te_prom_df
bamsignals::alignSignals(ctrl_de_te_profile) %>%
as.data.frame() %>%
mutate(position = row_number()) %>%
gather(prom, cov, -position) %>%
mutate(cov = (cov/ctrl_norm_factor) * 1e6,
condition = 'ctrl',
context = 'de') -> ctrl_te_prom_df
rbind(kras_te_prom_df, ctrl_te_prom_df) %>%
ggplot(aes(position, cov, group = condition, color = condition)) +
stat_summary(geom="ribbon", fun.data=mean_cl_normal, width=0.1, conf.int=0.95,
fill='grey', alpha = 0.1, size = rel(0.05), show.legend = F)+
# stat_summary(geom="line", fun=mean, linetype="dashed", alpha = 0.15, size = rel(0.05)) +
stat_summary(geom="point", fun=mean, alpha = 1, size = rel(1)) +
scale_color_manual(values = c(viridis(3)[1], viridis(3)[2])) +
ylab('mean CPM (95% CI)') + xlab('position within insertion') +
theme(legend.position = c(0.99, 0.99))
}
general_ame_tf_generator <- function(te_ranges_list) {
lapply(te_ranges_list, function(te_ranges) {
if(length(te_ranges) > 0 & length(te_ranges) < 6000) {
names(te_ranges) %>% unique() -> name
print(name)
print(length(te_ranges))
getSeq(BSgenome.Hsapiens.UCSC.hg38, te_ranges) -> te_seqs
memes::runAme(te_seqs, control = 'shuffle') -> te_motif_enrich
if (!is.null(te_motif_enrich)) {
te_motif_enrich %>%
mutate(tmp = motif_id,
TE = name) %>%
separate(tmp, sep = '_', into = c('TF', NA)) %>%
mutate(TF = str_replace(TF, pattern = '^ZN', replacement = 'ZNF'),
TF = str_replace(TF, pattern = 'ZNFF', replacement = 'ZNF'),
TF = str_replace(TF, pattern = '^NFAC', replacement = 'NFATC')) -> ame_tf
ame_tf
}
}
})
}
intra.aale.te_ranges_list$general_ame_tf <- general_ame_tf_generator(intra.aale.te_ranges_list$filter_gene)
exo.aale.te_ranges_list$general_ame_tf <- general_ame_tf_generator(exo.aale.te_ranges_list$filter_gene)
hbec.aale.te_ranges_list$general_ame_tf <- general_ame_tf_generator(hbec.aale.te_ranges_list$filter_gene)
te_tf_plots <- function(ame_tf_list, upreg_te_list, de_list) {
ame_tf_list %>%
bind_rows() %>%
filter(adj.pvalue < 0.05) %>%
separate(motif_id, sep = '[.]', into = c('TF', NA, NA, NA)) %>%
mutate(TF = str_replace(TF, pattern = '_HUMAN', replacement = '')) %>%
merge(de_list, by.x = 'TF', by.y = 'gene') %>%
merge(meta.data$te.families.clades, by.x = 'TE', by.y = 'gene') %>%
merge(upreg_te_list, by.x = 'TE', by.y = 'gene') -> general_ame_tf_to_plt
general_ame_tf_to_plt %>%
filter(padj.x < 0.05) %>%
ggplot(aes(reorder(TE, -log2FoldChange.y), log2FoldChange.x)) +
geom_boxplot(outlier.colour = NA, fill = NA) +
geom_point() +
ggrepel::geom_text_repel(aes(label = ifelse(log2FoldChange.x < -4, TF, ''))) +
ylab('log2FoldChange') +
xlab('') +
theme(axis.text.x = element_text(angle = 40, hjust = 1)) -> target_plot
general_ame_tf_to_plt %>%
select(TF, log2FoldChange.x, padj.x, TE, context) %>%
distinct() %>%
group_by(TF) %>%
summarise(log2FoldChange = median(log2FoldChange.x),
padj = median(padj.x),
TE = list(TE)) %>%
ggplot(aes(TE)) +
geom_bar() +
ggupset::scale_x_upset() +
ylab('shared TFs w/ present motifs') +
geom_text(stat='count', aes(label=after_stat(count)), vjust = -0.17, size = txt.mm_to_pts(0.8)) -> upset_plot
general_ame_tf_to_plt %>%
select(TF, log2FoldChange.x, padj.x, clade, context) %>%
distinct() %>%
group_by(TF) %>%
summarise(log2FoldChange = median(log2FoldChange.x),
padj = median(padj.x),
clade = list(clade)) %>%
mutate(gene_set = ifelse(TF %in% c(ifn_a, ifn_g), 'IFN', 'other')) %>%
mutate(gene_set = ifelse(TF %in% meta.data$trono.krab.znfs$gene, 'ZNF', gene_set)) %>%
mutate(gene_set = ifelse(TF %in% msig.df$HALLMARK_KRAS_SIGNALING_UP, 'KRAS', gene_set)) %>%
mutate(gene_set = factor(gene_set, levels = c('KRAS', 'IFN', 'ZNF', 'other'))) %>%
ggplot(aes(log2FoldChange, -log10(padj), color = gene_set)) +
geom_point(alpha = 0.4) +
scale_color_manual(values = c(viridis(4)[1], viridis(4)[2], viridis(4)[3], 'grey'), name = 'TF classification') +
ggrepel::geom_text_repel(aes(label = ifelse(gene_set != 'other' | TF == 'FOS' | abs(log2FoldChange) > 2, TF, '')),
show.legend = F,
max.overlaps = 20,
size = txt.mm_to_pts(0.8)) +
theme(legend.position = c(0.99,0.99)) -> volcano_plot
return(list('target_plt' = target_plot, 'upset_plt' = upset_plot, 'volcano_plt' = volcano_plot))
}
intra.aale.te_ranges_list$plts <- te_tf_plots(intra.aale.te_ranges_list$general_ame_tf, upreg_te$aale_intra, salmon.quant$aale$intra$deseq$de)
exo.aale.te_ranges_list$plts <- te_tf_plots(exo.aale.te_ranges_list$general_ame_tf, upreg_te$aale_exo, salmon.quant$aale$intra$deseq$de)
hbec.aale.te_ranges_list$plts <- te_tf_plots(hbec.aale.te_ranges_list$general_ame_tf, upreg_te$hbec_intra, salmon.quant$hbec$lacz_v_v12$deseq$de)
znf_list_generator <- function(te_ranges_list) {
te_ranges_list %>%
lapply(function(te_ranges) {
if(length(te_ranges) > 0 & length(te_ranges) < 6000) {
names(te_ranges) %>% unique() -> name
print(name)
print(length(te_ranges))
getSeq(BSgenome.Hsapiens.UCSC.hg38, te_ranges) -> te_seqs
memes::runAme(te_seqs, control = 'shuffle',
database = file.path(output.data.dir, 'reference.data',
'Hughes_ZNF_motifs.meme')) -> te_motif_enrich
if (!is.null(te_motif_enrich)) {
te_motif_enrich %>%
mutate(tmp = motif_id,
TE = name) %>%
separate(tmp, sep = '_', into = c('TF', NA)) -> ame_tf
ame_tf
}
}
})
}
intra.aale.te_ranges_list$znf_ame_tf <- znf_list_generator(intra.aale.te_ranges_list$filter_gene)
exo.aale.te_ranges_list$znf_ame_tf <- znf_list_generator(exo.aale.te_ranges_list$filter_gene)
hbec.aale.te_ranges_list$znf_ame_tf <- znf_list_generator(hbec.aale.te_ranges_list$filter_gene)
te_znf_plots <- function(ame_znf_list, upreg_te_list, de_list) {
ame_znf_list %>%
bind_rows() %>%
mutate(TF = str_split_fixed(TF, pattern = '[.]', n = 5)[,1]) %>%
filter(adj.pvalue < 0.05) %>%
merge(de_list, by.x = 'TF', by.y = 'gene') %>%
merge(meta.data$te.families.clades, by.x = 'TE', by.y = 'gene') %>%
merge(upreg_te_list, by.x = 'TE', by.y = 'gene') -> tmp
tmp %>%
filter(padj.x < 0.05) %>%
ggplot(aes(reorder(TE, -log2FoldChange.y), log2FoldChange.x, color = log2FoldChange.y)) +
# geom_boxplot(outlier.colour = NA, fill = NA, size = 0.5) +
geom_point() +
ggrepel::geom_text_repel(aes(label = ifelse(log2FoldChange.x < -2, TF, '')), size = txt.mm_to_pts(0.8), color = 'black') +
ylab('ZNF RNA log2FoldChange') +
xlab('putatively bound TE') +
scale_color_viridis_c(guide = guide_colorbar(title = 'TE log2FC')) +
theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
facet_row(~clade, scales = 'free_x', space = 'free') -> target_plot
tmp %>%
select(TF, log2FoldChange.x, padj.x, TE, context) %>%
distinct() %>%
group_by(TF) %>%
summarise(log2FoldChange = median(log2FoldChange.x),
padj = median(padj.x),
TE = list(TE)) %>%
ggplot(aes(TE)) +
geom_bar() +
ggupset::scale_x_upset() +
geom_text(stat='count', aes(label=after_stat(count)), vjust = -0.23, size = txt.mm_to_pts(0.8)) -> upset_plot
tmp %>%
select(TF, log2FoldChange.x, padj.x, clade, context) %>%
distinct() %>%
group_by(TF) %>%
summarise(log2FoldChange = median(log2FoldChange.x),
padj = median(padj.x),
clade = list(clade)) %>%
mutate(gene_set = ifelse(TF %in% c(ifn_a, ifn_g), 'IFN', 'other')) %>%
mutate(gene_set = ifelse(TF %in% meta.data$trono.krab.znfs$gene, 'ZNF', gene_set)) %>%
mutate(gene_set = ifelse(TF %in% msig.df$HALLMARK_KRAS_SIGNALING_UP, 'KRAS', gene_set)) %>%
mutate(gene_set = factor(gene_set, levels = c('KRAS', 'IFN', 'ZNF', 'other'))) %>%
ggplot(aes(log2FoldChange, -log10(padj), color = gene_set)) +
geom_point(alpha = 0.4) +
scale_color_manual(values = c(viridis(4)[1], viridis(4)[2], viridis(4)[3], 'grey'), name = 'diff. exp. TF classification') +
ggrepel::geom_text_repel(aes(label = ifelse(gene_set != 'other' & abs(log2FoldChange) < 1 | TF == 'FOS' | abs(log2FoldChange) > 3, TF, '')),
show.legend = F,
max.overlaps = 20, size = txt.mm_to_pts(0.8)) +
theme(legend.position = c(0.99,0.99)) -> volcano_plot
tmp %>%
filter(padj.x < 0.05) %>%
group_by(TE, clade) %>%
mutate(clade = factor(clade, levels = c('DNA', 'LINE', "LTR", 'Satellite', 'SINE'))) %>%
summarize(znf_log2FC = mean(log2FoldChange.x),
te_log2FC = mean(log2FoldChange.y)) %>%
ggplot(aes(te_log2FC, znf_log2FC, label = TE, color = clade)) +
geom_point() +
ggrepel::geom_text_repel(show.legend = F, size = txt.mm_to_pts(0.8)) +
geom_hline(yintercept = 0, linetype = 'dashed', alpha = 0.4) +
geom_vline(xintercept = 0, linetype = 'dashed', alpha = 0.4) +
scale_color_manual(values = te_alone_color_pal, name = 'TE Clade') +
ylab('TE-targeting ZNFs avg. RNA log2FC') +
xlab('putatively bound TE RNA log2FoldChange') +
theme(legend.position = c(0.99,0.99)) -> scatter_plot
return(list('de_list' = tmp, 'target_plt' = target_plot, 'upset_plt' = upset_plot, 'volcano_plt' = volcano_plot, 'scatter_plt' = scatter_plot))
}
intra.aale.te_ranges_list$znf_plts <- te_znf_plots(intra.aale.te_ranges_list$znf_ame_tf, upreg_te$aale_intra, salmon.quant$aale$intra$deseq$de)
exo.aale.te_ranges_list$znf_plts <- te_znf_plots(exo.aale.te_ranges_list$znf_ame_tf, upreg_te$aale_exo, salmon.quant$aale$intra$deseq$de)
hbec.aale.te_ranges_list$znf_plts <- te_znf_plots(hbec.aale.te_ranges_list$znf_ame_tf, upreg_te$hbec_intra, salmon.quant$hbec$lacz_v_v12$deseq$de)
znf_score <- read_csv(file = file.path(output.data.dir, 'all.znf.te.score.csv'))
znf_score %>%
dplyr::rename('gene' = '...1') -> znf_score
znf_score_plt <- function(gencode_de_list, te_de_list) {
znf_score %>%
gather(TE, score, -gene) %>%
group_by(gene) %>%
filter(score > 0) %>%
mutate(rank = dense_rank(-score)) %>%
filter(!grepl('Mamrep', TE)) %>%
separate(TE, sep = '\\/', into = c('clade', 'family', 'TE')) %>%
filter(gene %in% filter(gencode_de_list, padj < 0.05,
log2FoldChange < -0.75)$gene) -> tmp
tmp %>%
merge(te_de_list %>%
filter(padj < 0.05, log2FoldChange > 0.5), by.x = 'TE', by.y = 'gene') %>%
merge(gencode_de_list, by = 'gene') %>%
filter(rank < 50) %>%
ggplot(aes(TE, rank, color = log2FoldChange.y)) +
geom_point() +
scale_color_viridis_c(name = 'znf log2FC') +
ggrepel::geom_text_repel(aes(label = ifelse(rank < 12, gene, '')), size = txt.mm_to_pts(0.8), color = 'black') +
xlab('putatively bound TE') + ylab('binding score rank') +
theme(legend.position = c(0.99,0.99),
axis.text.x = element_text(angle = 40, hjust = 1)) +
facet_row(~clade, scales = 'free_x', space = 'free')
}
intra.aale.te_ranges_list$znf_plts$znf_score <- znf_score_plt(salmon.quant$aale$intra$deseq$de, ucsc.rmsk.salmon.quant$aale$intra$deseq$de)
exo.aale.te_ranges_list$znf_plts$znf_score <- znf_score_plt(salmon.quant$aale$intra$deseq$de, ucsc.rmsk.salmon.quant$aale$exo$deseq$de)
hbec.aale.te_ranges_list$znf_plts$znf_score <- znf_score_plt(salmon.quant$hbec$lacz_v_v12$deseq$de, ucsc.rmsk.salmon.quant$hbec$lacz_v_v12$deseq$de)
saveRDS(intra.aale.te_ranges_list, file.path(output.data.dir, 'intra.aale.te_ranges_list.rds'))
saveRDS(exo.aale.te_ranges_list, file.path(output.data.dir, 'exo.aale.te_ranges_list.rds'))
saveRDS(hbec.aale.te_ranges_list, file.path(output.data.dir, 'hbec.aale.te_ranges_list.rds'))
}
ATAC Coverage
library(bamsignals)
ctrl_norm_factor <- 43576060
kras_norm_factor <- 60276212
kras_bam_path <- file.path(output.data.dir, 'narrow_output.data/bwa/mergedLibrary/kras_R1.mLb.clN.sorted.bam')
ctrl_bam_path <- file.path(output.data.dir, 'narrow_output.data/bwa/mergedLibrary/ctrl_R1.mLb.clN.sorted.bam')
salmon.quant$aale$intra$deseq$de %>%
filter(padj < 0.05, log2FoldChange > 1.5, gene %in% c(ifn_g, ifn_a, jak_stat)) %>%
select(gene_id) %>% unlist() %>% unname() -> de_list
gr <- rowRanges(salmon.quant$gxi)
ifn_de_gr <- gr[names(gr) %in% de_list]
de_promoter_regions <- promoters(ifn_de_gr, upstream = 1000, downstream = 500)
bamsignals::bamCoverage(kras_bam_path, de_promoter_regions, paired.end = 'extend') -> kras_de_prom_profile
bamsignals::bamCoverage(ctrl_bam_path, de_promoter_regions, paired.end = 'extend') -> ctrl_de_prom_profile
bamsignals::alignSignals(kras_de_prom_profile) %>%
as.data.frame() %>%
mutate(position = row_number()) %>%
gather(prom, cov, -position) %>%
mutate(position = position - 1000,
cov = (cov/kras_norm_factor) * 1e6,
condition = 'kras',
context = 'de') -> kras_de_prom_df
bamsignals::alignSignals(ctrl_de_prom_profile) %>%
as.data.frame() %>%
mutate(position = row_number()) %>%
gather(prom, cov, -position) %>%
mutate(position = position - 1000,
cov = (cov/ctrl_norm_factor) * 1e6,
condition = 'ctrl',
context = 'de') -> ctrl_de_prom_df
gr <- rowRanges(salmon.quant$gxi)
salmon.quant$aale$intra$deseq$de %>%
filter(gene %in% meta.data$trono.krab.znfs$gene,
padj < 0.05, log2FoldChange <= -4.5) %>%
select(gene_id) %>% unlist() %>% unname() -> znf_de_list
znf_de_gr <- gr[names(gr) %in% znf_de_list]
de_promoter_regions <- promoters(znf_de_gr, upstream = 1000, downstream = 500)
bamsignals::bamCoverage(kras_bam_path, de_promoter_regions, paired.end = 'extend') -> kras_de_prom_profile
bamsignals::bamCoverage(ctrl_bam_path, de_promoter_regions, paired.end = 'extend') -> ctrl_de_prom_profile
bamsignals::alignSignals(kras_de_prom_profile) %>%
as.data.frame() %>%
mutate(position = row_number()) %>%
gather(prom, cov, -position) %>%
mutate(position = position - 1000,
cov = (cov/kras_norm_factor) * 1e6,
condition = 'kras',
context = 'de') -> znf_kras_de_prom_df
bamsignals::alignSignals(ctrl_de_prom_profile) %>%
as.data.frame() %>%
mutate(position = row_number()) %>%
gather(prom, cov, -position) %>%
mutate(position = position - 1000,
cov = (cov/ctrl_norm_factor) * 1e6,
condition = 'ctrl',
context = 'de') -> znf_ctrl_de_prom_df
de_promoter_regions_for_meme <- promoters(ifn_de_gr, upstream = 300, downstream = 500) %>%
memes::get_sequence(genome = BSgenome.Hsapiens.UCSC.hg38)
salmon.quant$aale$intra$deseq$de %>%
filter(padj > 0.5, gene %in% c(ifn_g, ifn_a)) %>%
select(gene_id) %>% unlist() %>% unname() -> ns_list
ns_gr <- gr[names(gr) %in% ns_list]
ns_promoter_regions <- promoters(ns_gr, upstream = 300, downstream = 500) %>%
memes::get_sequence(genome = BSgenome.Hsapiens.UCSC.hg38)
ifn_ame_out <- memes::runAme(de_promoter_regions_for_meme, control = ns_promoter_regions)
ifn_ame_out %>%
mutate(tmp = motif_id) %>%
separate(tmp, sep = '_', into = c('TF', NA)) %>%
mutate(TF = str_replace(TF, pattern = '^ZN', replacement = 'ZNF'),
TF = str_replace(TF, pattern = 'ZNFF', replacement = 'ZNF'),
TF = str_replace(TF, pattern = '^NFAC', replacement = 'NFATC')) -> ifn_ame_out
ATAC BW signal
library(trackplot)
library(scales)
bigWigs <- c(file.path(output.data.dir, 'atac_out/bwa/mergedLibrary/bigwig/kras_R1.mLb.clN.bigWig'),
file.path(output.data.dir, 'atac_out/bwa/mergedLibrary/bigwig/ctrl_R1.mLb.clN.bigWig'))
plot_big_wig_coverage <- function(loci, finish_plot = F, gene_name, y_lab = '', lab_tag = '') {
# loci <- paste0(chr, ':', start, '-', stop)
chr <- str_split(loci, pattern = c(':'))[[1]][1]
start <- as.numeric(str_replace_all(str_split(str_split(loci, pattern = c(':'))[[1]][2], c('-'))[[1]][1], ',' ,''))
stop <- as.numeric(str_replace_all(str_split(str_split(loci, pattern = c(':'))[[1]][2], c('-'))[[1]][2], ',' ,''))
track_data = track_extract(bigWigs = bigWigs, loci = loci, binsize = 1)
track_data = track_summarize(summary_list = track_data, condition = c("KRAS", "CTRL"), stat = "mean")
gene_model <- trackplot::.extract_geneModel_ucsc(chr = chr,
start = start,
end = stop,
refBuild = 'hg38')
max_vector <- c(as.data.frame(track_data$KRAS)$max, as.data.frame(track_data$CTRL)$max)
tx_tbl <- trackplot::.collapse_tx(gene_model)[[gene_name]]
region_start <- min(as.data.frame(track_data$KRAS)$start)
region_stop <- max(as.data.frame(track_data$KRAS)$end)
first_exon_start <- tx_tbl[[1]][1]
first_exon_stop <- tx_tbl[[2]][1]
exon_start_vector <- tx_tbl[[1]]
exon_end_vector <- tx_tbl[[2]]
first_gene_name <- attr(tx_tbl, 'gene')
start_stop_df <- data.frame(x = first_exon_stop,
start = exon_start_vector,
end = exon_end_vector,
y = 0.5)
gene_line_end <- ifelse(max(start_stop_df$end) >= region_stop, region_stop, max(start_stop_df$end))
if (attr(tx_tbl, 'strand') == '+') {
arrow_end = 'last'
gene_line_start <- ifelse(min(start_stop_df$start) >= region_start, min(start_stop_df$start), region_start)
} else {
arrow_end = 'first'
gene_line_start <- ifelse(min(start_stop_df$start) >= region_start, min(start_stop_df$start), region_start)
}
# break_step = (stop-start)/break_denom
track_data$KRAS %>%
as.data.frame() %>%
ggplot(aes(start, max)) +
scale_x_continuous(labels = comma) +
scale_y_continuous(limits = c(0, max(max_vector)),
expand = c(0,0), breaks = get_breaks(n = 4)) +
ylab(y_lab) +
theme(axis.line.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
plot.title = element_text(face = 'bold'),
plot.margin=margin(b = 0,unit = "mm")) +
xlab('') +
ggtitle(first_gene_name) +
geom_area(fill = alpha(viridis(3)[2], 1), color = alpha(viridis(3)[2], 1)) -> A.plt
ggplot(data = start_stop_df,
aes(x = x,
xmin = start,
xend = end,
y = y,
yend = y)) +
scale_x_continuous(limits = c(region_start, region_stop)) +
scale_y_continuous(limits = c(0,1)) +
geom_segment(aes(x = gene_line_start, xend = gene_line_end),
color = 'darkblue', size = 0.75,
alpha = 0.1, arrow = arrow(ends = arrow_end)) +
geom_segment(aes(x = start, xmin = start, xend = end, y = y, yend = y),
color = 'darkblue', size = 12) +
ylab('') + xlab('') +
theme(axis.line.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.line.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
plot.margin=margin(0,0,0,0,unit = "mm")) -> B.plt
track_data$CTRL %>%
as.data.frame() %>%
ggplot(aes(start, max)) +
scale_y_continuous(limits = c(0, max(max_vector)),
expand = c(0,0), breaks = get_breaks(n = 4)) +
scale_x_continuous(labels = comma, breaks = get_breaks(n = 3)) +
ylab('') + xlab(chr) +
geom_area(fill = alpha(viridis(3)[1], 1), color = alpha(viridis(3)[1], 1)) +
theme(axis.line.x = element_blank(),
plot.margin=margin(t = 0,unit = "mm")) -> C.plt
p4 <- ggplot(data.frame(l = A.plt$labels$y, x = 1, y = 1)) +
geom_text(aes(x, y, label = l), angle = 90) +
theme_void() +
coord_cartesian(clip = "off")
if (finish_plot == T) {
atac_layout <- "
DA
DB
DC
"
wrap_plots(A.plt + ylab(''),
B.plt,
C.plt + ylab(''),
p4,
design = atac_layout,
heights = c(0.85,0.15, 0.85),
widths = c(0.01, 1))
} else {
atac_layout <- "
A
B
C
"
wrap_plots(A.plt + labs(tag = lab_tag),
B.plt,
C.plt,
design = atac_layout,
heights = c(1,0.1, 1),
widths = c(1))
}
}
# , y_lab = 'norm.\nread depth'
plot_big_wig_coverage(loci = 'chr14:24,158,954-24,163,321', gene_name = 'IRF9') -> irf9_atac.plt
plot_big_wig_coverage(loci = 'chr11:613,326-618,086', gene_name = 'IRF7') -> irf7_atac.plt
plot_big_wig_coverage(loci = 'chr14:94,109,404-94,112,652', gene_name = 'IFI27') -> ifi27_atac.plt
plot_big_wig_coverage(loci = 'chr12:112,978,011-112,986,730', gene_name = 'OAS2') -> oas2_atac.plt
plot_big_wig_coverage(loci = 'chr1:78,649,502-78,650,799', gene_name = 'IFI44') -> ifi44_atac.plt
plot_big_wig_coverage(loci = 'chr21:41,418,539-41,427,685', gene_name = 'MX1') -> mx1_atac.plt
ifn_atac_layout <- "
GABC
GDEF
"
p4 <- ggplot(data.frame(l = 'scaled read depth', x = 1, y = 1)) +
geom_text(aes(x, y, label = l), angle = 90, size = txt.mm_to_pts(0.9)) +
theme_void() +
coord_cartesian(clip = "off")
wrap_plots(irf9_atac.plt,
irf7_atac.plt,
ifi27_atac.plt,
oas2_atac.plt,
ifi44_atac.plt,
mx1_atac.plt,
p4,
design = ifn_atac_layout,
widths = c(0.1, 1, 1, 1)) -> ifn_atac_6.plt
plot_big_wig_coverage(loci = 'chr19:20,074,561-20,081,789', gene_name = 'ZNF90') -> znf90_atac.plt
plot_big_wig_coverage(loci = 'chr19:20,418,910-20,432,132', gene_name = 'ZNF826P') -> znf826p_atac.plt
plot_big_wig_coverage(loci = 'chr7:64,297,656-64,316,528', gene_name = 'ZNF736') -> znf736_atac.plt
plot_big_wig_coverage(loci = 'chr19:56,497,894-56,520,251', gene_name = 'ZNF471') -> znf471_atac.plt
plot_big_wig_coverage(loci = 'chr19:20,030,435-20,054,396', gene_name = 'ZNF682') -> znf682_atac.plt
plot_big_wig_coverage(loci = 'chr7:6,613,987-6,618,813', gene_name = 'ZNF853') -> znf853_atac.plt
znf_atac_layout <- "
GABC
GDEF
"
wrap_plots(znf90_atac.plt,
znf826p_atac.plt,
znf736_atac.plt,
znf471_atac.plt,
znf682_atac.plt,
znf853_atac.plt,
p4,
design = znf_atac_layout,
widths = c(0.1, 1, 1, 1)) -> znf_atac_6.plt
znf
salmon.quant$aale$intra$deseq$de %>%
filter(padj < 0.05, log2FoldChange < -1, gene %in% meta.data$trono.krab.znfs$gene | grepl('^ZNF', gene)) %>%
select(gene_id) %>% unlist() %>% unname() -> znf_de_list
salmon.quant$aale$intra$deseq$de %>%
filter(padj < 0.05, log2FoldChange > 1.5) %>%
select(gene_id) %>% unlist() %>% unname() -> up_de_list
gr <- rowRanges(salmon.quant$gxi)
znf_de_gr <- gr[names(gr) %in% znf_de_list]
up_de_gr <- gr[names(gr) %in% up_de_list]
consensus_atac_boolean_anno %>%
filter(ctrl_R1.mLb.clN.bool == T) %>%
filter(`Gene Name` %in% filter(salmon.quant$aale$intra$deseq$de, padj < 0.05,
log2FoldChange < -1.5)$gene) %>%
filter(`Gene Name` %in% meta.data$trono.krab.znfs$gene | grepl('^ZNF', `Gene Name`)) %>%
separate(Annotation, sep = '\\(', into = c('annotation', NA)) %>%
filter(annotation == "promoter-TSS ") %>%
# filter(chr != 'chr19') %>%
mutate(strand = '+') %>%
select(chr, start, end, strand, name = `Gene Name`) %>%
GenomicRanges::makeGRangesFromDataFrame() -> ctrl_znf_uniq_peak_ranges
consensus_atac_boolean_anno %>%
filter(kras_R1.mLb.clN.bool == T) %>%
filter(`Gene Name` %in% filter(salmon.quant$aale$intra$deseq$de, padj < 0.05,
log2FoldChange > 1.5)$gene) %>%
filter(`Gene Name` %in% c(ifn_g, ifn_a)) %>%
separate(Annotation, sep = '\\(', into = c('annotation', NA)) %>%
filter(annotation == "promoter-TSS ") %>%
# filter(chr != 'chr19') %>%
mutate(strand = '+') %>%
select(chr, start, end, strand, name = `Gene Name`) %>%
GenomicRanges::makeGRangesFromDataFrame() -> kras_ifn_uniq_peak_ranges
znf_promoter_seqs <-
GenomicRanges::promoters(znf_de_gr, upstream = 1000, downstream = 500) %>%
memes::get_sequence(genome = BSgenome.Hsapiens.UCSC.hg38)
upDe_promoter_seqs <-
GenomicRanges::promoters(up_de_gr, upstream = 1000, downstream = 200) %>%
memes::get_sequence(genome = BSgenome.Hsapiens.UCSC.hg38)
znf_uniq_peaks_seqs <-
memes::get_sequence(regions = ctrl_znf_uniq_peak_ranges, genome = BSgenome.Hsapiens.UCSC.hg38)
ifn_uniq_peaks_seqs <-
memes::get_sequence(regions = kras_ifn_uniq_peak_ranges, genome = BSgenome.Hsapiens.UCSC.hg38)
znf_ame_out <- memes::runAme(znf_promoter_seqs,
control = upDe_promoter_seqs)
znf_ame_out %>%
mutate(tmp = motif_id) %>%
separate(tmp, sep = '_', into = c('TF', NA)) %>%
mutate(TF = str_replace(TF, pattern = '^ZN', replacement = 'ZNF'),
TF = str_replace(TF, pattern = 'ZNFF', replacement = 'ZNF'),
TF = str_replace(TF, pattern = '^NFAC', replacement = 'NFATC')) %>%
filter(adj.pvalue < 0.05) -> znf_ame_out
Figure 1
complex plotting
plot_complex_heatmap <- function(filter_values, show_genes = F, split_vector = NULL, split_title = NULL) {
set.seed(123)
## heatmap count matrix
working_toil_counts %>%
filter(gene %in% filter_values) %>%
group_by(gene) %>%
mutate(count = scale(log1p(count), center = T)) %>%
select(sample, gene, count) %>%
pivot_wider(names_from = gene, values_from = count) %>%
distinct() %>%
column_to_rownames('sample') %>%
t() -> toil_hm_counts
## heatmap mutation annotation
working_toil_counts %>%
select(sample, 'G12D' = KRAS) %>%
distinct() %>%
column_to_rownames('sample') %>%
ComplexHeatmap::columnAnnotation(df = . ,
col = list(G12D = c('WT_normal' = 'white',
'p.G12D' = viridis(3)[2])),
show_legend = F,
annotation_name_side = 'left',
annotation_name_gp = grid::gpar(fontsize = 8),
simple_anno_size = unit(2.5, "mm")) -> toil_sample_anno
## heatmap DE barpot annotation
working_toil_counts %>%
filter(gene %in% filter_values) %>%
select(gene) %>%
distinct() %>%
merge(salmon.quant$aale$intra$deseq$de %>% select(gene, log2FoldChange), by = 'gene') %>%
column_to_rownames('gene') -> toil_de
toil_de[order(match(rownames(toil_de), rownames(toil_hm_counts))), , drop =F] -> toil_de
toil_de_list <- setNames(toil_de$log2FoldChange, rownames(toil_de))
ComplexHeatmap::rowAnnotation(
`AALE RNA log2FC` = ComplexHeatmap::anno_barplot(x = toil_de_list,
bar_width = 1,
which = 'row',
gp = grid::gpar(col = "black", fill = viridis(3)[2]),
border = FALSE,
annotation_name_gp = grid::gpar(fontsize = 8),
width = unit(1.25, "cm"),
height = unit(0.75, "cm")),
show_annotation_name = TRUE,
annotation_name_gp = grid::gpar(fontsize = 8)) -> log2fc_ha
working_toil_counts %>%
filter(gene %in% filter_values) %>%
select(gene) %>%
distinct() %>%
mutate(IRF9 = ifelse(gene %in% irf9_fimo_results$gene, T, F),
IRF1 = ifelse(gene %in% irf1_fimo_results$gene, T, F),
IRF7 = ifelse(gene %in% irf7_fimo_results$gene, T, F)) %>%
column_to_rownames('gene') %>%
ComplexHeatmap::rowAnnotation(df =.,
col = list(IRF9 = c('TRUE' = viridis(2)[1], 'FALSE' = 'white'),
IRF1 = c('TRUE' = viridis(2)[1], 'FALSE' = 'white'),
IRF7 = c('TRUE' = viridis(2)[1], 'FALSE' = 'white')),
show_legend = F,
annotation_name_gp = grid::gpar(fontsize = 8),
simple_anno_size = unit(2, "mm")) -> gene_motif_anno
ComplexHeatmap::Heatmap(toil_hm_counts,
name = 'TCGA\nz-score',
show_row_names = show_genes,
show_column_names = F,
row_split = split_vector,
row_title = split_title,
row_title_gp = grid::gpar(fontsize = 7, face = 'bold'),
row_names_side = 'right',
row_dend_width = unit(3, 'mm'),
left_annotation = gene_motif_anno,
right_annotation = log2fc_ha,
top_annotation = toil_sample_anno,
heatmap_legend_param =
list(labels_gp = grid::gpar(fontsize = 7),
legend_gp = grid::gpar(fontsize = 7),
legend_direction = 'vertical',
legend_height = unit(3.5, "cm"),
title_gp = grid::gpar(fontsize = 7, face = 'bold')),
row_names_gp = grid::gpar(fontsize = 7),
heatmap_width = patch_fig_width*0.875)
}
consensus_atac_boolean_anno %>%
filter(ctrl_R1.mLb.clN.bool == F) %>%
filter(`Gene Name` %in% filter(salmon.quant$aale$intra$deseq$de, padj < 0.05, log2FoldChange > 1.5)$gene) %>%
# filter(`Gene Name` %in% c(ifn_g, ifn_a)) %>%
separate(Annotation, sep = '\\(', into = c('annotation', NA)) %>%
filter(annotation == "promoter-TSS ") %>%
# group_by(annotation) %>%
group_by(`Gene Name`) %>%
tally() -> aale_uniq_up
salmon.quant$aale$intra$deseq$de %>%
filter(padj < 0.05, log2FoldChange > 1.5, gene %in% c(ifn_a, ifn_g)) -> aale_ifn
unique_peak_aale_ifn_list <- unique(c(aale_uniq_up$`Gene Name`, aale_ifn$gene))
aale_ifn_chm <- plot_complex_heatmap(unique_peak_aale_ifn_list, show_genes = T,
split_vector = 2, split_title = "cluster_%s")
aale_ifn_chm_no_rownames <- aale_ifn_chm
aale_ifn_chm_no_rownames@row_names_param$show <- FALSE
split_rowOrder <- ComplexHeatmap::row_order(ComplexHeatmap::draw(aale_ifn_chm_no_rownames))

aale_ifn_chm <- plot_complex_heatmap(unique_peak_aale_ifn_list, show_genes = T)
aale_ifn_chm_rowOrder <- ComplexHeatmap::row_order(ComplexHeatmap::draw(aale_ifn_chm))

ifn_high_hm_zoom <- aale_ifn_chm[split_rowOrder[[1]], ]
patchwork
msig.df[grepl('INTERFERON', names(msig.df))] %>%
unlist() %>% unname() -> ifn_gene_list
salmon.quant$aale$intra$deseq$de %>%
mutate(padj = ifelse(padj == 0,
min(filter(salmon.quant$aale$intra$deseq$de, padj != 0)$padj), padj)) %>%
mutate(gene_set = ifelse(gene %in% ifn_gene_list,'IFN', 'other'),
gene_set = ifelse(gene %in% msig.df$HALLMARK_KRAS_SIGNALING_UP | gene == 'KRAS', 'KRAS', gene_set),
gene_set = ifelse(gene %in% meta.data$trono.krab.znfs$gene, 'ZNF', gene_set)) %>%
mutate(gene_set = factor(gene_set, levels = c('IFN','KRAS','ZNF', 'other'))) %>%
filter(padj < 0.05, gene_set != 'other') %>%
mutate(kras = ifelse(gene == 'KRAS', T, F)) %>%
ggplot(aes(log2FoldChange, -log10(padj), label = gene, color = kras)) +
geom_point(alpha = 0.6) +
ggrepel::geom_text_repel(aes(label = ifelse(abs(log2FoldChange) > 5 & gene_set != 'other' |
padj < 0.0001 & gene_set != 'other' | gene == "KRAS", gene, '')),
size = txt.mm_to_pts(0.8), segment.color = NA) +
facet_col(~gene_set, scales = 'free_y') +
xlab('RNA log2FoldChange') +
scale_color_manual(values = c('black', 'red'), guide = 'none') +
geom_vline(xintercept = 0, alpha = 0.3, linetype = 'dashed') -> plt_1A.plt
salmon.quant$aale$intra$deseq$H.fgsea %>%
arrange(NES) %>%
mutate(rank = row_number()) %>%
group_by(pathway) %>%
mutate(pathway = str_replace_all(pathway, 'HALLMARK ', ''),
pathway = str_replace_all(pathway, ' ', '\n'),
observed = length(unlist(leadingEdge)),
NES = round(NES, 3)) %>%
# arrange(-NES, padj) %>%
filter(!grepl('ZNF', pathway)) %>%
ggplot(aes(NES, rank, color = -log10(padj))) +
geom_point(size = 4) +
geom_segment(aes(x=0, xend=NES, y=rank, yend=rank)) +
geom_text(aes(label = paste0(observed, '/', size),
x = NES - 0.75 * sign(NES)),
nudge_y = 0.15,
color = 'black',
size = txt.mm_to_pts(0.8)) +
geom_text(aes(label = pathway,
x = - .9 * sign(NES)),
nudge_y = 0.05,
color = 'black',
size = txt.mm_to_pts(0.8)) +
scale_color_viridis_c() +
ylab('gene set') + xlab('GSEA NES') +
geom_vline(xintercept = 0, alpha = 0.3, linetype = 'dashed') +
theme(legend.position = c(0.99, 0.17),
axis.ticks.y = element_blank(),
axis.text.y = element_blank()) -> plt_1B.plt
salmon.quant$aale$intra$deseq$de %>%
dplyr::rename('AALE' = log2FoldChange) %>%
filter(padj < 0.05,
gene %in% c(ifn_a, ifn_g)) %>%
merge(salmon.quant$hbec$lacz_v_v12$deseq$de %>%
dplyr::rename('HBEC' = log2FoldChange) %>%
filter(padj < 0.05) %>%
mutate(HBEC = HBEC), by = 'gene') %>%
ggplot(aes(AALE, HBEC)) +
geom_point() +
xlab('AALE ISG RNA log2FC') + ylab('HBEC ISG RNA log2FC') +
geom_hline(yintercept = 0, linetype = 'dashed', alpha = 0.4) +
geom_vline(xintercept = 0, linetype = 'dashed', alpha = 0.4) +
geom_text_repel(aes(label = ifelse(AALE > 1 & HBEC > 1, gene, '')),
size = txt.mm_to_pts(0.8),show.legend = F) +
theme(legend.position = c(0.99, 0.22)) -> plt_1C.plt
salmon.quant$aale$intra$deseq$de %>%
merge(ame_tf, by.x = 'gene', by.y = 'TF') %>%
ggplot(aes(log2FoldChange, -log10(padj), label = gene)) +
geom_point() +
geom_text_repel(color = 'black', size = txt.mm_to_pts(0.8)) +
geom_vline(xintercept = 0, alpha = 0.3, linetype = 'dashed') +
xlab('ISG-binding TF RNA log2FC') +
theme(legend.position = c(0.35, 0.79)) -> plt_1D.plt
fig_1_patch <- "
AB
AB
CD
EE
"
wrap_plots(plt_1A.plt,
plt_1B.plt,
plt_1C.plt,
plt_1D.plt,
patchwork::wrap_elements(full =
grid::grid.grabExpr(ComplexHeatmap::draw(ifn_high_hm_zoom,
heatmap_legend_side = 'right'),
width = patch_fig_width)),
heights = c(1,1,0.75,1.25),
design = fig_1_patch) +
plot_annotation(tag_levels = 'A') &
theme(plot.tag = element_text(size = rel(0.95), face = 'bold'),
plot.tag.position = c(0.01, 0.99)) -> figure_1_patchwork.fig
ggsave2(plot = figure_1_patchwork.fig,
filename = file.path(figure.dir, 'fig.1', 'figure_1_patchwork.pdf'),
width = patch_fig_width, height = patch_fig_height, device = cairo_pdf, units = 'mm')
figure_1_patchwork.fig

Figure S1
salmon.quant$aale$intra$deseq$de %>%
mutate(padj = ifelse(padj == 0,
min(filter(salmon.quant$aale$intra$deseq$de, padj != 0)$padj), padj)) %>%
mutate(biotype = ifelse(gene %in% meta.data$gencode.35.lnc.gene.names$gene, 'lncRNA', 'protein-coding')) %>%
mutate(biotype = factor(biotype, levels = c('protein-coding','lncRNA'))) %>%
filter(padj < 0.05) %>%
ggplot(aes(log2FoldChange, -log10(padj), label = gene)) +
geom_point(alpha = 0.6) +
ggrepel::geom_text_repel(aes(label = ifelse(abs(log2FoldChange) > 5 |
-log10(padj) > 250, gene, '')),
size = txt.mm_to_pts(0.8), segment.color = NA, force = 10, num.overlaps = 5) +
facet_col(~biotype, scales = 'free') +
xlab('RNA log2FoldChange') -> plt_S1A.plt
salmon.quant$aale$intra$deseq$pca$pca.out %>%
rownames_to_column('sample') %>%
ggplot(aes(PC1, PC2,
fill = condition,
color = condition)) +
geom_point(size = rel(2)) +
scale_size_continuous(range = c(0.1, 2.5), breaks = c(0,5,10,15,20)) +
xlab(paste('AALE in. PC1',
round(cumsum(salmon.quant$aale$intra$deseq$pca$pcs.props)[1], digits = 3), sep = ' ')) +
ylab(paste('AALE in. PC2', round(cumsum(salmon.quant$aale$intra$deseq$pca$pcs.props)[2] -
cumsum(salmon.quant$aale$intra$deseq$pca$pcs.props)[1],
digits = 3), sep = ' ')) +
scale_color_manual(values = c(viridis(3)[1], viridis(3)[2])) +
geom_vline(xintercept = 0, linetype = 'dotted', alpha = 0.3, size = 0.25) +
geom_hline(yintercept = 0, linetype = 'dotted', alpha = 0.3, size = 0.25) +
theme(legend.position = c(0.30, 0.45),
legend.key.width = unit(0.125,'cm'),
legend.key.height = unit(0.4, 'cm')) -> plt_S1B.plt
salmon.quant$hbec$lacz_v_v12$deseq$de %>%
mutate(padj = ifelse(padj == 0,
min(filter(salmon.quant$hbec$lacz_v_v12$deseq$de, padj != 0)$padj), padj)) %>%
mutate(gene_set = ifelse(gene %in% ifn_gene_list,'HBEC-IFN', 'other'),
gene_set = ifelse(gene %in% msig.df$HALLMARK_KRAS_SIGNALING_UP | gene == 'KRAS', 'HBEC-KRAS', gene_set),
gene_set = ifelse(gene %in% meta.data$trono.krab.znfs$gene, 'HBEC-ZNF', gene_set)) %>%
mutate(gene_set = factor(gene_set, levels = c('HBEC-IFN','HBEC-KRAS','HBEC-ZNF', 'other'))) %>%
filter(padj < 0.05, gene_set != 'other') %>%
mutate(kras = ifelse(gene == 'KRAS', T, F)) %>%
ggplot(aes(log2FoldChange, -log10(padj), label = gene, color = kras)) +
geom_point(alpha = 0.6, shape = 24) +
ggrepel::geom_text_repel(aes(label = ifelse(abs(log2FoldChange) > 5 & gene_set != 'other' |
padj < 0.0001 & gene_set != 'other' | gene == "KRAS", gene, '')),
size = txt.mm_to_pts(0.8), segment.color = NA) +
facet_col(~gene_set, scales = 'free_y') +
scale_color_manual(values = c('black', 'red'), guide = 'none') +
xlab('RNA log2FoldChange') +
geom_vline(xintercept = 0, alpha = 0.3, linetype = 'dashed') -> plt_S1C.plt
fig_S1_patch <- "
AC
AC
BC
EE
"
wrap_plots(plt_S1A.plt,
plt_S1B.plt,
plt_S1C.plt,
patchwork::wrap_elements(full =
grid::grid.grabExpr(ComplexHeatmap::draw(aale_ifn_chm_no_rownames,
heatmap_legend_side = 'right'),
width = patch_fig_width)),
design = fig_S1_patch,
heights = c(0.75,0.75,0.75, 1.75)) +
plot_annotation(tag_levels = 'A') &
theme(plot.tag = element_text(size = rel(0.95), face = 'bold'),
plot.tag.position = c(0.01, 0.99)) -> figure_S1_patchwork.fig
ggsave(plot = figure_S1_patchwork.fig,
filename = file.path(figure.dir, 'fig.1', 'figure_S1_patchwork.pdf'),
width = patch_fig_width, height = patch_fig_height, device = cairo_pdf, units = 'mm')
figure_S1_patchwork.fig

Figure 2
rbind(kras_de_prom_df, ctrl_de_prom_df) %>%
mutate(condition = toupper(condition)) %>%
ggplot(aes(position, cov, group = condition, color = condition)) +
stat_summary(geom="ribbon", fun.data=mean_cl_boot, width=0.1, conf.int=0.95,
fill='grey', alpha = 0.1, size = rel(0.05), show.legend = F)+
stat_summary(geom="point", fun=mean, alpha = 1, size = rel(1), show.legend = T) +
scale_color_manual(values = c(viridis(3)[1], viridis(3)[2])) +
xlab('distance to TSS') +
ylab('mean CPM (95% CI)') +
xlim(-1000, 500) +
theme(legend.position = c(0.3, 0.99)) -> plt_2A.plt
salmon.quant$aale$intra$deseq$de %>%
merge(uniq_kras_peak_genes, by.x = 'gene', by.y = 'Gene Name') %>%
mutate(`unique peaks` = ifelse(gene %in% c(ifn_a, ifn_g), 'ISG', 'other')) %>%
ggplot(aes(log2FoldChange, -log10(padj), label = gene, color = `unique peaks`)) +
geom_point() +
scale_color_manual(values = c('black', 'grey'), name = 'unique ATAC peak') +
geom_text_repel(aes(label = ifelse(`unique peaks` == 'ISG' | log2FoldChange > 6, gene, '')),
size = txt.mm_to_pts(0.8), show.legend = F, segment.color = NA) +
geom_vline(xintercept = 0, alpha = 0.3, linetype = 'dashed') +
xlab('RNA log2FoldChange')+
theme(legend.position = c(0.40, 1)) -> plt_2B.plt
base::do.call(rbind, uniq_kras_great_tb) %>%
group_by(Ontology) %>%
top_n(30, -HyperFdrQ) %>%
filter(HyperFdrQ < 0.05,
NumFgGenesHit > 1,
grepl('interferon|virus|RNA|oligo|^ex', Desc),
!grepl('part|poly', Desc)) %>%
ggplot(aes(log2(RegionFoldEnrich), reorder(Desc, RegionFoldEnrich), label = Desc, color = -log10(HyperFdrQ))) +
geom_point(aes(size = NumFgGenesHit)) +
geom_text_repel(aes(label = Desc),
size = txt.mm_to_pts(0.8), show.legend = F, point.padding = unit(2, 'mm'), color = 'black') +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
plot.title = element_text(size = rel(0.7), face = 'plain'),
legend.position = c(0.99,0.35), legend.direction = 'horizontal') +
scale_x_continuous() +
ylab('GO Term') + xlab('ATAC region GREAT log2(FoldEnrichment)') +
scale_color_viridis_c() -> plt_2D.plt
fig_2_patch <- "
AB
CC
DD
"
wrap_plots(plt_2A.plt,
plt_2B.plt,
wrap_elements(full = ifn_atac_6.plt),
plt_2D.plt,
heights = c(0.75,1.5,0.75),
design = fig_2_patch) +
plot_annotation(tag_levels = 'A') -> figure_2_patchwork.fig
ggsave(plot = figure_2_patchwork.fig,
filename = file.path(figure.dir, 'fig.2', 'figure_2_patchwork.pdf'),
width = patch_fig_width, height = patch_fig_height, device = cairo_pdf, units = 'mm')
figure_2_patchwork.fig

Figure 3
kras.sight <- read_csv(file.path(output.data.dir, 'kras.nanosight.data.csv'), skip = 2,
col_names = c('size', 'r1', 'r2', 'r3', NA, NA, NA, NA)) %>%
mutate(condition = 'kras')
ctrl.sight <- read_csv(file.path(output.data.dir, 'ctrl.nanosight.data.csv'), skip = 2,
col_names = c('size', 'r1', 'r2', 'r3', NA, NA, NA, NA)) %>%
mutate(condition = 'ctrl')
nano_sight <- rbind(kras.sight, ctrl.sight)
nano_sight %>%
select(size, r1, r2, r3, condition) %>%
gather(rep, value, -condition, -size) %>%
filter(size <= 400) %>%
mutate(value = value/1e6,
condition = paste0(toupper(condition), ' EV')) %>%
group_by(size, condition) %>%
summarize(y = mean(value),
ymin = mean(value) - sd(value),
ymax = mean(value) + sd(value)) %>%
ggplot(aes(size, y = y, ymin = ymin, ymax = ymax, fill = condition)) +
geom_ribbon(alpha = 0.25) +
geom_line(color = 'darkgrey') +
scale_fill_manual(values = c(viridis(3)[1], viridis(3)[2]), guide = F) +
stat_peaks(geom = 'text', span = 35, vjust = -0.5, y.label.fmt = '%#f') +
facet_row(~condition) +
coord_cartesian(clip = 'off') +
ylab('1e6 particles per mL') + xlab('nm') -> plt_3A.plt
salmon.quant$aale$exo$deseq$de %>%
dplyr::rename('EXO_aale' = log2FoldChange) %>%
filter(padj < 0.05) %>%
merge(salmon.quant$aale$intra$deseq$de %>%
dplyr::rename('INTRA_aale' = log2FoldChange) %>%
filter(padj < 0.05), by = 'gene') %>%
lm(EXO_aale ~ INTRA_aale, data = .) %>%
summary() -> exo_lm_out
b <- round(exo_lm_out$coefficients[1,1], 2)
m <- round(exo_lm_out$coefficients[2,1], 2)
r2 <- round(exo_lm_out$adj.r.squared, 2)
salmon.quant$aale$exo$deseq$de %>%
dplyr::rename('EXO_aale' = log2FoldChange) %>%
filter(padj < 0.05) %>%
merge(salmon.quant$aale$intra$deseq$de %>%
dplyr::rename('INTRA_aale' = log2FoldChange) %>%
filter(padj < 0.05), by = 'gene') %>%
mutate(unique = ifelse(gene %in% uniq_kras_peak_genes$`Gene Name`, 'KRAS',
ifelse(gene %in% uniq_ctrl_peak_genes$`Gene Name`, 'CTRL', 'none'))) %>%
ggplot(aes(EXO_aale, INTRA_aale, color = unique)) +
geom_point() +
scale_color_manual(values = c(viridis(3)[1], viridis(3)[2], 'grey'), name = 'unique ATAC peak') +
geom_hline(yintercept = 0, linetype = 'dashed', alpha = 0.4) +
geom_vline(xintercept = 0, linetype = 'dashed', alpha = 0.4) +
geom_text_repel(aes(label = ifelse(abs(EXO_aale) > 1 & abs(INTRA_aale) > 1 | unique != 'none', gene, '')),
size = txt.mm_to_pts(0.8), show.legend = F, segment.color = NA) +
theme(legend.position = c(0.99, 0.23)) +
geom_line(stat = 'smooth', method = 'lm', linetype = 'dashed', alpha = 0.8, color = 'purple') +
annotate(geom = 'text', x = 2.35, y = -2, fontface = 'italic',
label = paste0('y', ' ', ' = ',b,' + ',m,'x; \nadj. R2 = ', r2),
size = rel(3.75), alpha = 0.8) +
ylab('intracellular RNA log2FC') + xlab('extracellular RNA log2FC') -> plt_3B.plt
salmon.quant$aale$exo$deseq$de %>%
mutate(biotype = ifelse(gene %in% meta.data$gencode.35.lnc.gene.names$gene, 'lncRNA', 'coding')) %>%
# filter(padj < 0.05) %>%
ggplot(aes(log2FoldChange, -log10(padj), label = gene, color = biotype)) +
geom_point(alpha = 0.6) +
scale_color_manual(values = c('black', 'grey')) +
ggrepel::geom_text_repel(aes(label = ifelse(abs(log2FoldChange) > 1, gene, '')), size = txt.mm_to_pts(0.8),
show.legend = F) +
geom_vline(xintercept = 0, alpha = 0.3, linetype = 'dashed') +
xlab('ex. RNA log2FoldChange') +
theme(legend.position = c(0.25,0.99)) -> plt_3C.plt
salmon.quant$aale$intra$deseq$de %>%
filter(padj < 0.05, abs(log2FoldChange) > 0.5) %>%
mutate(context = ifelse(log2FoldChange > 0, 'in_up', 'in_dn')) %>%
select(gene, context) %>%
rbind(
salmon.quant$aale$exo$deseq$de %>%
filter(padj < 0.05, abs(log2FoldChange) > 0.5) %>%
mutate(context = ifelse(log2FoldChange > 0, 'ex_up', 'ex_dn')) %>%
select(gene, context)
) %>%
group_by(gene) %>%
summarize(contexts = list(context)) %>%
ggplot(aes(contexts)) +
geom_bar() +
ggupset::scale_x_upset() +
geom_text(stat='count', aes(label=after_stat(count)), vjust = -0.05, size = txt.mm_to_pts(0.8)) +
xlab('') + ylab('overlapping DE genes') -> plt_3D.plt
salmon.quant$aale$exo$deseq$H.fgsea %>%
filter(pathway %in% salmon.quant$aale$intra$deseq$H.fgsea$pathway) %>%
mutate(context = 'ex.') %>%
rbind(salmon.quant$aale$intra$deseq$H.fgsea %>%
filter(pathway %in% salmon.quant$aale$exo$deseq$H.fgse$pathway) %>%
mutate(context = 'in.')) %>%
arrange(NES) %>%
mutate(rank = row_number()) %>%
group_by(pathway) %>%
mutate(pathway = str_replace_all(pathway, 'HALLMARK ', ''),
observed = length(unlist(leadingEdge)),
NES = round(NES, 3)) %>%
filter(!grepl('ZNF', pathway)) %>%
ggplot(aes(NES, context, color = -log10(padj))) +
geom_point(size = 4) +
geom_segment(aes(x=0, xend=NES, y=context, yend=context)) +
scale_color_viridis_c() +
ylab('context') + xlab('GSEA NES') +
ggforce::facet_col(~pathway, scales = 'free_y') +
geom_vline(xintercept = 0, alpha = 0.3, linetype = 'dashed') +
theme(legend.position = c(0.25, 0.99),
legend.direction = 'vertical') -> plt_3E.plt
ucsc.rmsk.salmon.quant$aale$exo$deseq$de %>%
filter(!gene %in% meta.data$gencode.35.tx.to.gene$gene,
padj < 0.05) %>%
merge(meta.data$te.families.clades, by = 'gene') %>%
ggplot(aes(log2FoldChange, -log10(padj), label = gene, color = clade)) +
geom_point(alpha = 0.6) +
# xlim(-2, 2) +
scale_color_viridis_d(direction = -1) +
ggrepel::geom_text_repel(aes(label = ifelse(abs(log2FoldChange) >= 1.15, gene, '')), show.legend = F) +
theme(legend.position = c(0.99,0.95)) +
xlab('ex. TE RNA log2FoldChange')-> plt_3F.plt
fig_3_patch <- "
AB
CD
EF
"
wrap_plots(plt_3A.plt,
plt_3C.plt,
plt_3B.plt,
wrap_elements(full = plt_3D.plt),
plt_3E.plt,
plt_3F.plt,
design = fig_3_patch,
heights = c(1, 1, 1)) +
plot_annotation(tag_levels = 'A') -> figure_3_patchwork.fig
ggsave(plot = figure_3_patchwork.fig,
filename = file.path(figure.dir, 'fig.3', 'figure_3_patchwork.pdf'),
width = patch_fig_width, height = patch_fig_height, device = cairo_pdf, units = 'mm')
figure_3_patchwork.fig

Figure S2
salmon.quant$aale$exo$deseq$pca$pca.out %>%
rownames_to_column('sample') %>%
mutate(condition = toupper(condition)) %>%
ggplot(aes(PC1, PC2,
fill = condition,
color = condition)) +
geom_point(size = rel(2)) +
scale_size_continuous(range = c(0.1, 2.5), breaks = c(0,5,10,15,20)) +
xlab(paste('AALE ex. PC1', round(cumsum(salmon.quant$aale$exo$deseq$pca$pcs.props)[1], digits = 3), sep = ' ')) +
ylab(paste('AALE ex. PC2', round(cumsum(salmon.quant$aale$exo$deseq$pca$pcs.props)[2] -
cumsum(salmon.quant$aale$exo$deseq$pca$pcs.props)[1],
digits = 3), sep = ' ')) +
scale_color_manual(values = c(viridis(3)[1], viridis(3)[2])) +
geom_vline(xintercept = 0, linetype = 'dotted', alpha = 0.3, size = 0.25) +
geom_hline(yintercept = 0, linetype = 'dotted', alpha = 0.3, size = 0.25) +
theme(legend.position = c(0.99, 0.30),
legend.key.width = unit(0.125,'cm'),
legend.key.height = unit(0.4, 'cm')) -> plt_S3A.plt
ucsc.rmsk.salmon.quant$aale$exo$deseq$counts %>%
filter(!gene %in% meta.data$gencode.35.tx.to.gene$gene) %>%
merge(meta.data$te.families.clades, by = 'gene') %>%
filter(clade %in% c('DNA', "LINE",'SINE', 'LTR', 'Satellite')) %>%
mutate(clade = factor(clade, levels = c('DNA', 'LINE', 'LTR', 'Satellite', 'SINE'))) %>%
gather(sample, count, -gene, -clade, -family) %>%
separate(sample, sep = '[.]', into = c(NA, NA, NA, 'condition')) %>%
ggplot(aes(condition, log1p(count))) +
geom_boxplot(outlier.colour = NA, fill = NA) +
geom_jitter(alpha = 0.1, color = 'black', width = 0.15) +
facet_col(~clade, scales = 'free_y') +
# scale_y_continuous(limits = c(0,13)) +
stat_compare_means(method = 'wilcox',
label = 'p.format',
# label.y = 2,
# label.x = 0.5,
method.args = list(alternative = 'greater'),
ref.group = 'ctrl') +
theme(strip.placement = 'inside') +
ylab('log(norm. count + 1)') +
coord_cartesian(clip = 'off')-> plt_S3B.plt
ucsc.rmsk.salmon.quant$aale$exo$deseq$counts %>%
# filter(gene %in% meta.data$gencode.35.tx.to.gene$gene) %>%
merge(meta.data$te.families.clades, by = 'gene', all.x = T) %>%
mutate(clade = ifelse(is.na(clade), 'coding', clade),
clade = ifelse(gene %in% meta.data$gencode.35.lnc.gene.names$gene, 'lncRNA', clade)) %>%
gather(sample, count, -family, -clade, -gene) %>%
filter(!grepl('^MT|^RP', gene)) %>%
mutate(condition = ifelse(grepl('ctrl', sample), 'ctrl', 'kras'),
sample = str_replace(sample, 'rnaEASY.aale.', '')) %>%
group_by(sample, condition, clade) %>%
summarize(clade_counts = sum(count)) %>%
group_by(sample) %>%
mutate(frac = clade_counts/sum(clade_counts)) %>%
mutate(clade = factor(clade, levels = c('DNA', "LINE",'LTR', 'Satellite', 'SINE', 'lncRNA', 'coding'))) %>%
filter(clade %in% c('DNA', "LINE",'SINE', 'LTR', 'Satellite', 'coding', 'lncRNA')) %>%
ggplot(aes(sample, frac, fill = clade)) +
geom_col(color = 'white') +
# geom_hline(yintercept = seq(0,1,0.25), color = 'white', alpha = 0.6) +
scale_fill_viridis_d(direction = -1) +
facet_col(~condition, scales = 'free_x') +
xlab('') +
ylab('fraction of norm. counts') +
ggtitle('extracellular') +
theme(legend.position = 'top', legend.justification = 'center',
axis.text.x = element_blank(), axis.ticks.x = element_blank()) -> plt_S3C.plt
ucsc.rmsk.salmon.quant$aale$intra$deseq$counts %>%
# filter(gene %in% meta.data$gencode.35.tx.to.gene$gene) %>%
merge(meta.data$te.families.clades, by = 'gene', all.x = T) %>%
mutate(clade = ifelse(is.na(clade), 'coding', clade),
clade = ifelse(gene %in% meta.data$gencode.35.lnc.gene.names$gene, 'lncRNA', clade)) %>%
gather(sample, count, -family, -clade, -gene) %>%
filter(!grepl('^MT|^RP', gene)) %>%
mutate(condition = ifelse(grepl('ctrl', sample), 'ctrl', 'kras'),
sample = str_replace(sample, 'rnaEASY.aale.', '')) %>%
group_by(sample, condition, clade) %>%
summarize(clade_counts = sum(count)) %>%
group_by(sample) %>%
mutate(frac = clade_counts/sum(clade_counts)) %>%
mutate(clade = factor(clade, levels = c('DNA', "LINE",'LTR', 'Satellite', 'SINE', 'lncRNA', 'coding'))) %>%
filter(clade %in% c('DNA', "LINE",'SINE', 'LTR', 'Satellite', 'coding', 'lncRNA')) %>%
ggplot(aes(sample, frac, fill = clade)) +
geom_col(color = 'white') +
# geom_hline(yintercept = seq(0,1,0.25), color = 'white', alpha = 0.6) +
scale_fill_viridis_d(direction = -1) +
facet_col(~condition, scales = 'free_x') +
xlab('') +
ylab('fraction of norm. counts') +
ggtitle('intracellular') +
theme(legend.position = 'top', legend.justification = 'center',
axis.text.x = element_blank(), axis.ticks.x = element_blank()) -> plt_S3D.plt
# exo_biotype.df %>%
# mutate(context = 'extracellular') %>%
# rbind(intra_biotype.df %>%
# mutate(context = 'intracellular')) %>%
# ggplot(aes(context, frac, color = condition)) +
# geom_boxplot(outlier.colour = NA, fill = NA) +
# # geom_jitter(alpha = 0.6, width = 1) +
# facet_wrap(~clade, scales = 'free_y', ncol = 2) +
# xlab('') + ylab('fraction of norm. counts') +
# # stat_compare_means(method = 'wilcox', comparisons = list(c('intracellular', 'extracellular'))) +
# scale_color_manual(values = c(viridis(3)[1], viridis(3)[2])) +
# theme(legend.position = c(0.99, 0.15)) -> plt_S3B.plt
fig_S3_patch <- "
AB
AB
CD
"
wrap_plots(plt_S3A.plt,
plt_S3B.plt,
plt_S3D.plt,
plt_S3C.plt,
design = fig_S3_patch) +
plot_annotation(tag_levels = 'A') -> figure_S3_patchwork.fig
ggsave(plot = figure_S3_patchwork.fig,
filename = file.path(figure.dir, 'fig.2', 'figure_S2_patchwork.pdf'),
width = patch_fig_width, height = patch_fig_height, device = cairo_pdf, units = 'mm')
figure_S3_patchwork.fig

Figure S3
ucsc.rmsk.salmon.quant$aale$intra$deseq$de %>%
filter(!gene %in% meta.data$gencode.35.tx.to.gene$gene,
padj < 0.05) %>%
mutate(context = 'AALE') %>%
rbind(ucsc.rmsk.salmon.quant$hbec$lacz_v_v12$deseq$de %>%
filter(!gene %in% meta.data$gencode.35.tx.to.gene$gene,
padj < 0.05) %>%
mutate(context = 'HBEC')) %>%
merge(meta.data$te.families.clades, by = 'gene') %>%
mutate(clade = factor(clade, levels = c('DNA', 'LINE', "LTR", 'Satellite', 'SINE'))) %>%
ggplot(aes(log2FoldChange, -log10(padj), label = gene, color = clade)) +
geom_point(alpha = 0.6) +
ggrepel::geom_text_repel(aes(label = ifelse(log2FoldChange > 0, gene, '')),
show.legend = F, max.overlaps = 30, size = txt.mm_to_pts(0.8)) +
scale_color_viridis_d(direction = -1, name = 'TE Clade') +
facet_col(~context, scales = 'free_y') +
xlab('RNA log2FoldChange') +
xlim(-6,4) +
geom_vline(xintercept = 0, alpha = 0.3, linetype = 'dashed') +
theme(legend.position = c(0.35,0.35)) -> plt_4A.plt
salmon.quant$aale$intra$deseq$de %>%
dplyr::rename('AALE' = log2FoldChange) %>%
filter(padj < 0.05,
gene %in% meta.data$trono.krab.znfs$gene) %>%
merge(salmon.quant$hbec$lacz_v_v12$deseq$de %>%
dplyr::rename('HBEC' = log2FoldChange) %>%
filter(padj < 0.05) %>%
mutate(HBEC = HBEC), by = 'gene') %>%
merge(meta.data$trono.krab.znfs, by = 'gene') %>%
ggplot(aes(AALE, HBEC, color = ZNF)) +
geom_point() +
scale_color_viridis_d(name = 'ZNF type') +
xlab('AALE log2FoldChange') + ylab('HBEC log2FoldChange') +
geom_hline(yintercept = 0, linetype = 'dashed', alpha = 0.4) +
geom_vline(xintercept = 0, linetype = 'dashed', alpha = 0.4) +
scale_x_continuous(limits = c(-3,3)) +
scale_y_continuous(limits = c(-2,2)) +
geom_text_repel(aes(label = ifelse(AALE < -1 & HBEC < -0.35, gene, '')), show.legend = F, size = txt.mm_to_pts(0.8), color = 'black') +
theme(legend.position = c(0.99, 0.99)) -> plt_4B.plt
plt_4D_patch <- "
AAA
BCD
EEE
"
p4 <- ggplot(data.frame(l = 'putatively bound TE RNA log2FoldChange', x = 1, y = 1)) +
geom_text(aes(x, y, label = l), angle = 0, size = txt.mm_to_pts(0.8)) +
theme_void() +
coord_cartesian(clip = "off")
wrap_elements(full = wrap_plots(guide_area(),
intra.aale.te_ranges_list$znf_plts$scatter_plt + theme(legend.direction = 'horizontal') + ggtitle('AALE in.') + xlab(''),
exo.aale.te_ranges_list$znf_plts$scatter_plt + theme(legend.direction = 'horizontal') + ggtitle('AALE ex.') + ylab('') + xlab(''),
hbec.aale.te_ranges_list$znf_plts$scatter_plt + theme(legend.direction = 'horizontal') + ggtitle('HBEC in.') + ylab('') + xlab(''),
p4,
guides = 'collect', heights = c(0.05, 1, 0.005), design = plt_4D_patch)) -> plt_4D.plt
fig_4_patch <- "
AB
CD
EE
"
wrap_plots(plt_4A.plt,
plt_4B.plt,
intra.aale.te_ranges_list$znf_plts$target_plt + theme(legend.position = c(0.3,0.3)) + ggtitle('ZNF motif binding'),
intra.aale.te_ranges_list$znf_plts$znf_score + theme(legend.position = c(0.99,0.99)) + ggtitle('ZNF binding score rank'),
plt_4D.plt,
design = fig_4_patch) +
plot_annotation(tag_levels = 'A') -> figure_4_patchwork.fig
ggsave(plot = figure_4_patchwork.fig,
filename = file.path(figure.dir, 'fig.3', 'figure_S3_patchwork.pdf'),
width = patch_fig_width, height = patch_fig_height, device = cairo_pdf, units = 'mm')
figure_4_patchwork.fig

Figure S4
intra.aale.te_ranges_list$znf_plts$de_list %>%
select(TF, log2FoldChange.x, padj.x, clade, context) %>%
distinct() %>%
group_by(TF, clade) %>%
summarise(log2FoldChange = median(log2FoldChange.x),
padj = median(padj.x)) %>%
ggplot(aes(log2FoldChange, -log10(padj))) +
geom_point(alpha = 0.4) +
ggrepel::geom_text_repel(aes(label = ifelse(abs(log2FoldChange) > 1, TF, '')),
show.legend = F,
max.overlaps = 20, size = txt.mm_to_pts(0.8)) +
theme(legend.position = 'none') +
facet_col(~clade) +
xlab('RNA log2FoldChange') +
geom_vline(xintercept = 0, linetype = 'dashed', alpha = 0.4) +
ggtitle('AALE in. DE ZNFs targeting TE clades') -> plt_S4A.plt
exo.aale.te_ranges_list$znf_plts$de_list %>%
select(TF, log2FoldChange.x, padj.x, clade, context) %>%
distinct() %>%
group_by(TF, clade) %>%
summarise(log2FoldChange = median(log2FoldChange.x),
padj = median(padj.x)) %>%
ggplot(aes(log2FoldChange, -log10(padj))) +
geom_point(alpha = 0.4) +
ggrepel::geom_text_repel(aes(label = ifelse(abs(log2FoldChange) > 2.5, TF, '')),
show.legend = F,
max.overlaps = 20, size = txt.mm_to_pts(0.8)) +
theme(legend.position = 'none') +
facet_col(~clade) +
xlab('RNA log2FoldChange') +
geom_vline(xintercept = 0, linetype = 'dashed', alpha = 0.4) +
ggtitle('AALE ex. DE ZNFs targeting TE clades') -> plt_S4B.plt
hbec.aale.te_ranges_list$znf_plts$de_list %>%
select(TF, log2FoldChange.x, padj.x, clade, context) %>%
distinct() %>%
group_by(TF, clade) %>%
summarise(log2FoldChange = median(log2FoldChange.x),
padj = median(padj.x)) %>%
ggplot(aes(log2FoldChange, -log10(padj))) +
geom_point(alpha = 0.4) +
ggrepel::geom_text_repel(aes(label = ifelse(abs(log2FoldChange) > 1, TF, '')),
show.legend = F,
max.overlaps = 20, size = txt.mm_to_pts(0.8)) +
theme(legend.position = 'none') +
facet_col(~clade) +
xlab('RNA log2FoldChange') +
geom_vline(xintercept = 0, linetype = 'dashed', alpha = 0.4) +
ggtitle('HBEC in. DE ZNFs targeting TE clades') -> plt_S4C.plt
fig_S4_patch <- "
AD
BD
CE
"
wrap_plots(plt_S4A.plt,
plt_S4B.plt,
plt_S4C.plt,
wrap_elements(full = intra.aale.te_ranges_list$plts$upset_plt),
plot_spacer(),
design = fig_S4_patch) +
plot_annotation(tag_levels = 'A') -> figure_S4_patchwork.fig
ggsave2(plot = figure_S4_patchwork.fig,
filename = file.path(figure.dir, 'fig.4', 'figure_S4_patchwork.pdf'),
width = patch_fig_width, height = patch_fig_height, device = cairo_pdf, units = 'mm')
figure_S4_patchwork.fig

Figure 4
rbind(znf_kras_de_prom_df, znf_ctrl_de_prom_df) %>%
ggplot(aes(position, cov, group = condition, color = condition)) +
stat_summary(geom="ribbon", fun.data=mean_cl_boot, width=0.1, conf.int=0.95,
fill='grey', alpha = 0.1, size = rel(0.05), show.legend = F)+
stat_summary(geom="point", fun=mean, alpha = 1, size = rel(1)) +
scale_color_manual(values = c(viridis(3)[1], viridis(3)[2])) +
xlab('distance to TSS') +
ylab('mean CPM (95% CI)') +
xlim(-1000, 500) +
theme(legend.position = c(0.30, 0.99)) -> plt_6A.plt
salmon.quant$aale$intra$deseq$de %>%
merge(uniq_ctrl_peak_genes, by.x = 'gene', by.y = 'Gene Name') %>%
mutate(`unique peaks` = ifelse(gene %in% meta.data$trono.krab.znfs$gene | grepl('ZNF', gene), 'ZNF', 'other')) %>%
ggplot(aes(log2FoldChange, -log10(padj), label = gene, color = `unique peaks`)) +
geom_point() +
scale_color_manual(values = c('grey', 'black'), name = 'unique ATAC peak') +
geom_text_repel(aes(label = ifelse(`unique peaks` == 'ZNF' | abs(log2FoldChange) > 4, gene, '')),
size = txt.mm_to_pts(0.8), show.legend = F, max.overlaps = 30, segment.color = NA) +
geom_vline(xintercept = c(0), alpha = c(0.3), linetype = 'dashed') +
xlab('RNA log2FoldChange') +
theme(legend.position = c(0.40, 0.99)) -> plt_6B.plt
salmon.quant$aale$intra$deseq$de %>%
merge(znf_ame_out, by.x = 'gene', by.y = 'TF') %>%
mutate(gene_set = ifelse(gene %in% ifn_gene_list,'IFN', 'other'),
gene_set = ifelse(gene %in% msig.df$HALLMARK_KRAS_SIGNALING_UP, 'KRAS', gene_set),
gene_set = ifelse(gene %in% meta.data$trono.krab.znfs$gene | grepl('^ZNF', gene), 'ZNF', gene_set)) %>%
mutate(gene_set = factor(gene_set, levels = c('KRAS','IFN','ZNF', 'other'))) %>%
ggplot(aes(log2FoldChange, -log10(padj), color = gene_set, label = gene)) +
geom_point() +
scale_color_viridis_d(name = 'TF classification') +
xlab('DE TF RNA log2FoldChange') +
xlim(-2.5,2.5)+
geom_text_repel(aes(label = ifelse(abs(log2FoldChange) > 0, gene, '')), color = 'black') +
geom_vline(xintercept = 0, alpha = 0.3, linetype = 'dashed') +
theme(legend.position = 'top') -> plt_6D.plt
fig_6_patch <- "
AB
CC
DD
"
wrap_plots(plt_6A.plt,
plt_6B.plt,
wrap_elements(full = znf_atac_6.plt),
plt_6D.plt,
# plt_6E.plt,
heights = c(0.75,1.5,0.75),
design = fig_6_patch) +
plot_annotation(tag_levels = 'A') -> figure_6_patchwork.fig
ggsave(plot = figure_6_patchwork.fig,
filename = file.path(figure.dir, 'fig.4', 'figure_4_patchwork.pdf'),
width = patch_fig_width, height = patch_fig_height, device = cairo_pdf, units = 'mm')
figure_6_patchwork.fig

Figure 5
complex plotting
plot_complex_heatmap <- function(filter_values, show_genes = F, split_vector = NULL, split_title = NULL) {
set.seed(123)
## heatmap count matrix
working_toil_counts %>%
filter(gene %in% filter_values) %>%
group_by(gene) %>%
mutate(count = scale(log1p(count), center = T)) %>%
select(sample, gene, count) %>%
pivot_wider(names_from = gene, values_from = count) %>%
distinct() %>%
column_to_rownames('sample') %>%
t() -> toil_hm_counts
## heatmap mutation annotation
working_toil_counts %>%
select(sample, 'G12D' = KRAS) %>%
distinct() %>%
column_to_rownames('sample') %>%
ComplexHeatmap::columnAnnotation(df = . ,
col = list(G12D = c('WT_normal' = 'white',
'p.G12D' = viridis(3)[2])),
show_legend = F,
annotation_name_side = 'right',
annotation_name_gp = grid::gpar(fontsize = 8),
simple_anno_size = unit(2.5, "mm")) -> toil_sample_anno
## heatmap DE barpot annotation
working_toil_counts %>%
filter(gene %in% filter_values) %>%
select(gene) %>%
distinct() %>%
merge(salmon.quant$aale$intra$deseq$de %>% select(gene, log2FoldChange), by = 'gene') %>%
column_to_rownames('gene') -> toil_de
toil_de[order(match(rownames(toil_de), rownames(toil_hm_counts))), , drop =F] -> toil_de
toil_de_list <- setNames(toil_de$log2FoldChange, rownames(toil_de))
ComplexHeatmap::rowAnnotation(
`AALE RNA log2FC` = ComplexHeatmap::anno_barplot(x = toil_de_list,
bar_width = 1,
which = 'row',
gp = grid::gpar(col = "black", fill = viridis(3)[2]),
border = FALSE,
annotation_name_gp = grid::gpar(fontsize = 8),
width = unit(1.25, "cm"),
height = unit(0.75, "cm")),
show_annotation_name = TRUE,
annotation_name_gp = grid::gpar(fontsize = 8)) -> log2fc_ha
working_toil_counts %>%
filter(gene %in% filter_values) %>%
select(gene) %>%
distinct() %>%
mutate(uniq_peak = ifelse(gene %in% znf_unique_peaks$name, T, F)) %>%
column_to_rownames('gene') %>%
ComplexHeatmap::rowAnnotation(df =.,
col = list(uniq_peak = c('TRUE' = viridis(2)[1], 'FALSE' = 'white')),
show_legend = F, annotation_name_side = 'top',
annotation_name_gp = grid::gpar(fontsize = 8),
simple_anno_size = unit(2, "mm")) -> gene_motif_anno
ComplexHeatmap::Heatmap(toil_hm_counts,
name = 'TCGA\nz-score',
show_row_names = show_genes,
show_column_names = F,
row_split = split_vector,
row_title = split_title,
row_title_gp = grid::gpar(fontsize = 7, face = 'bold'),
row_names_side = 'right',
row_dend_width = unit(3, 'mm'),
left_annotation = gene_motif_anno,
right_annotation = log2fc_ha,
top_annotation = toil_sample_anno,
heatmap_legend_param =
list(labels_gp = grid::gpar(fontsize = 7),
legend_gp = grid::gpar(fontsize = 7),
legend_direction = 'vertical',
legend_height = unit(3.5, "cm"),
title_gp = grid::gpar(fontsize = 7, face = 'bold')),
row_names_gp = grid::gpar(fontsize = 7),
heatmap_width = patch_fig_width*0.875)
}
consensus_atac_boolean_anno %>%
filter(ctrl_R1.mLb.clN.bool == T) %>%
filter(`Gene Name` %in% filter(salmon.quant$aale$intra$deseq$de, padj < 0.05,
log2FoldChange < -1.5)$gene) %>%
filter(`Gene Name` %in% meta.data$trono.krab.znfs$gene | grepl('^ZNF', `Gene Name`)) %>%
separate(Annotation, sep = '\\(', into = c('annotation', NA)) %>%
filter(annotation == "promoter-TSS ") %>%
mutate(strand = '+') %>%
select(chr, start, end, strand, name = `Gene Name`) -> znf_unique_peaks
salmon.quant$aale$intra$deseq$de %>%
filter(padj < 0.05, log2FoldChange < -1, gene %in% meta.data$trono.krab.znfs$gene | grepl('ZNF', gene)) -> aale_znf
aale_znf_chm <- plot_complex_heatmap(aale_znf$gene, show_genes = T, split_vector = NULL, split_title = NULL)
working_toil_counts %>%
filter(gene %in% c(aale_znf$gene)) %>%
group_by(gene) %>%
mutate(count = scale(log1p(count), center = T)) %>%
select(sample, count, gene, KRAS) %>%
group_by(gene) %>%
summarize(w = wilcox.test(count ~ KRAS)$p.value) %>%
filter(w < 0.05) %>%
pull(gene) -> toil_sig_znf
working_toil_counts %>%
filter(gene %in% toil_sig_znf) %>%
group_by(gene) %>%
mutate(count = scale(log1p(count), center = T)) %>%
rename('KRAS' = 'G12D') %>%
ggplot(aes(gene, count, color = G12D)) +
geom_boxplot(fill = NA, outlier.colour = NA) +
geom_point(alpha = 0.1, position = position_jitterdodge(jitter.width = 0.25)) +
ylab('z-score') + xlab('') +
scale_color_manual(values = c(viridis(3)[2], viridis(3)[1]), name = 'KRAS') +
theme(axis.text.x = element_text(angle = 40, hjust = 1),
legend.position = c(0.99, 0.25)) -> plt_7B.plt
znf.avg.counts.tidy.stratified <-
norm.counts.final %>%
filter(sample_type == 'Primary Tumor') %>%
filter(gene %in% toil_sig_znf) %>%
select(sample, count) %>%
group_by(sample) %>%
summarize(avg.count = mean(count)) %>%
mutate(gene.set = 'aale_znf_signal') %>%
group_by(gene.set) %>%
mutate(stratum = ifelse(avg.count >= quantile(avg.count, 0.66),
'top-third', ifelse(avg.count <= quantile(avg.count, 0.33),
'bottom-third', 'middle'))) %>%
filter(stratum != 'middle') %>%
spread(gene.set, stratum)
surv.obj <- Surv(time = filter(luad.phenotypes.merge %>% select(sample, OS.time, OS) %>% distinct() ,
sample %in% znf.avg.counts.tidy.stratified$sample)$OS.time,
event = filter(luad.phenotypes.merge %>% select(sample, OS.time, OS) %>% distinct(),
sample %in% znf.avg.counts.tidy.stratified$sample)$OS)
surv.fit <- survfit(as.formula(surv.obj ~ znf.avg.counts.tidy.stratified$aale_znf_signal),
data = znf.avg.counts.tidy.stratified)
ggsurvplot(surv.fit,
pval = T,
legend.title = "",
ggtheme = theme_set_fun() + theme(legend.position = c(0.45,0.95)),
pval.coord = c(0, 0.08),
palette = viridis(3)[c(2,1)]) -> surv.plt
model.plt <- magick::image_read_pdf(path = file.path(figure.dir, 'fig.5','aale_model_cropped.pdf'))
fig_7_patch <- "
AA
BB
CD
"
wrap_plots(patchwork::wrap_elements(full =
grid::grid.grabExpr(ComplexHeatmap::draw(aale_znf_chm,
heatmap_legend_side = 'right'),
width = patch_fig_width)),
plt_7B.plt,
wrap_elements(full = surv.plt$plot),
wrap_elements(full = magick::image_ggplot(model.plt)),
heights = c(1.5,0.75,0.75),
design = fig_7_patch) +
plot_annotation(tag_levels = 'A') -> figure_7_patchwork.fig
ggsave(plot = figure_7_patchwork.fig,
filename = file.path(figure.dir, 'fig.5', 'figure_5_patchwork.pdf'),
width = patch_fig_width, height = patch_fig_height, device = cairo_pdf, units = 'mm')
figure_7_patchwork.fig
